% Workshop paper record (ACL Anthology export): Lu and Lin, 2023, pages 184--192.
@inproceedings{lu-lin-2023-characterised,
    title     = {Characterised {LLM}s Affect its Evaluation of Summary and Translation},
    author    = {Lu, Yuan and
                 Lin, Yu-Ting},
    editor    = {Deutsch, Daniel and
                 Dror, Rotem and
                 Eger, Steffen and
                 Gao, Yang and
                 Leiter, Christoph and
                 Opitz, Juri and
                 R{\"u}ckl{\'e}, Andreas},
    booktitle = {Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems},
    month     = nov,
    year      = {2023},
    address   = {Bali, Indonesia},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2023.eval4nlp-1.15},
    doi       = {10.18653/v1/2023.eval4nlp-1.15},
    pages     = {184--192},
    abstract  = {In today{'}s widespread use of Large Language Models (LLMs), there have been significant achievements in various text domains such as generating summaries and translations. However, there is still room for development and improvement in evaluating the outputs of LLMs. In this paper, we propose an innovative scoring system that assesses the quality of summaries and translations using multiple metrics, we also enhance LLM{'}s performance in scoring tasks by assigning it different roles, effectively making it act as an expert. We test four roles in the study: a teacher, a proofreader, a travel writer, and an internet troll, comparing the advantages and disadvantages of each role in the scoring task. Our research results demonstrate that emphasizing LLM{'}s multilingual capabilities and strict standards as its identity can effectively boost its performance. Additionally, imbuing LLM with a more critical thinking ability enhances its performance in translation tasks compared to a milder LLM identity. In summary, we show that assigning different identities to LLM can influence its performance in scoring tasks. We believe that this research will contribute to the use of LLMs for scoring purposes.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Article-level record; the ID matches the citekey identifier below. -->
  <mods ID="lu-lin-2023-characterised">
    <titleInfo>
      <title>Characterised LLMs Affect its Evaluation of Summary and Translation</title>
    </titleInfo>
    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Yuan</namePart>
      <namePart type="family">Lu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yu-Ting</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Deutsch</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rotem</namePart>
        <namePart type="family">Dror</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steffen</namePart>
        <namePart type="family">Eger</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Gao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christoph</namePart>
        <namePart type="family">Leiter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juri</namePart>
        <namePart type="family">Opitz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Rücklé</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bali, Indonesia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In today’s widespread use of Large Language Models (LLMs), there have been significant achievements in various text domains such as generating summaries and translations. However, there is still room for development and improvement in evaluating the outputs of LLMs. In this paper, we propose an innovative scoring system that assesses the quality of summaries and translations using multiple metrics, we also enhance LLM’s performance in scoring tasks by assigning it different roles, effectively making it act as an expert. We test four roles in the study: a teacher, a proofreader, a travel writer, and an internet troll, comparing the advantages and disadvantages of each role in the scoring task. Our research results demonstrate that emphasizing LLM’s multilingual capabilities and strict standards as its identity can effectively boost its performance. Additionally, imbuing LLM with a more critical thinking ability enhances its performance in translation tasks compared to a milder LLM identity. In summary, we show that assigning different identities to LLM can influence its performance in scoring tasks. We believe that this research will contribute to the use of LLMs for scoring purposes.</abstract>
    <!-- Identifiers and landing page for the paper. -->
    <identifier type="citekey">lu-lin-2023-characterised</identifier>
    <identifier type="doi">10.18653/v1/2023.eval4nlp-1.15</identifier>
    <location>
      <url>https://aclanthology.org/2023.eval4nlp-1.15</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2023-11</date>
      <extent unit="page">
        <start>184</start>
        <end>192</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Characterised LLMs Affect its Evaluation of Summary and Translation
%A Lu, Yuan
%A Lin, Yu-Ting
%Y Deutsch, Daniel
%Y Dror, Rotem
%Y Eger, Steffen
%Y Gao, Yang
%Y Leiter, Christoph
%Y Opitz, Juri
%Y Rücklé, Andreas
%S Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems
%D 2023
%8 November
%I Association for Computational Linguistics
%C Bali, Indonesia
%F lu-lin-2023-characterised
%X In today’s widespread use of Large Language Models (LLMs), there have been significant achievements in various text domains such as generating summaries and translations. However, there is still room for development and improvement in evaluating the outputs of LLMs. In this paper, we propose an innovative scoring system that assesses the quality of summaries and translations using multiple metrics, we also enhance LLM’s performance in scoring tasks by assigning it different roles, effectively making it act as an expert. We test four roles in the study: a teacher, a proofreader, a travel writer, and an internet troll, comparing the advantages and disadvantages of each role in the scoring task. Our research results demonstrate that emphasizing LLM’s multilingual capabilities and strict standards as its identity can effectively boost its performance. Additionally, imbuing LLM with a more critical thinking ability enhances its performance in translation tasks compared to a milder LLM identity. In summary, we show that assigning different identities to LLM can influence its performance in scoring tasks. We believe that this research will contribute to the use of LLMs for scoring purposes.
%R 10.18653/v1/2023.eval4nlp-1.15
%U https://aclanthology.org/2023.eval4nlp-1.15
%U https://doi.org/10.18653/v1/2023.eval4nlp-1.15
%P 184-192
Markdown (Informal)
[Characterised LLMs Affect its Evaluation of Summary and Translation](https://aclanthology.org/2023.eval4nlp-1.15) (Lu & Lin, Eval4NLP-WS 2023)
ACL