BibTeX
@inproceedings{elangovan-etal-2024-considers,
title = "{C}on{S}i{DERS}-The-Human Evaluation Framework: Rethinking Human Evaluation for Generative Large Language Models",
author = "Elangovan, Aparna and
Liu, Ling and
Xu, Lei and
Bodapati, Sravan Babu and
Roth, Dan",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.63/",
doi = "10.18653/v1/2024.acl-long.63",
pages = "1137--1160",
abstract = "In this position paper, we argue that human evaluation of generative large language models (LLMs) should be a multidisciplinary undertaking that draws upon the insights from disciplines such as user experience research and human behavioral psychology to ensure that the experimental design and results are reliable. The conclusions from these evaluations, therefore, must consider factors such as usability, aesthetics and cognitive biases. We highlight how cognitive biases can conflate fluent information and truthfulness, and how cognitive uncertainty affects the reliability of rating scores such as Likert. Furthermore, the evaluation should differentiate the capabilities and weaknesses of increasingly powerful large language models - which requires effective test sets. Scalability of human evaluation is also crucial to wider adoption. Hence, to design an effective human evaluation system in the age of generative NLP we propose the ConSiDERS-The-Human evaluation framework consisting of 6 pillars - Consistency, Scoring Criteria, Differentiating, User Experience, Responsible, and Scalability."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="elangovan-etal-2024-considers">
    <titleInfo>
      <title>ConSiDERS-The-Human Evaluation Framework: Rethinking Human Evaluation for Generative Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Aparna</namePart>
      <namePart type="family">Elangovan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ling</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lei</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sravan</namePart>
      <namePart type="given">Babu</namePart>
      <namePart type="family">Bodapati</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Roth</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Martins</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Vivek</namePart>
        <namePart type="family">Srikumar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bangkok, Thailand</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this position paper, we argue that human evaluation of generative large language models (LLMs) should be a multidisciplinary undertaking that draws upon the insights from disciplines such as user experience research and human behavioral psychology to ensure that the experimental design and results are reliable. The conclusions from these evaluations, therefore, must consider factors such as usability, aesthetics and cognitive biases. We highlight how cognitive biases can conflate fluent information and truthfulness, and how cognitive uncertainty affects the reliability of rating scores such as Likert. Furthermore, the evaluation should differentiate the capabilities and weaknesses of increasingly powerful large language models - which requires effective test sets. Scalability of human evaluation is also crucial to wider adoption. Hence, to design an effective human evaluation system in the age of generative NLP we propose the ConSiDERS-The-Human evaluation framework consisting of 6 pillars - Consistency, Scoring Criteria, Differentiating, User Experience, Responsible, and Scalability.</abstract>
    <identifier type="citekey">elangovan-etal-2024-considers</identifier>
    <identifier type="doi">10.18653/v1/2024.acl-long.63</identifier>
    <location>
      <url>https://aclanthology.org/2024.luhme-long.63/</url>
    </location>
    <part>
      <date>2024-08</date>
      <extent unit="page">
        <start>1137</start>
        <end>1160</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T ConSiDERS-The-Human Evaluation Framework: Rethinking Human Evaluation for Generative Large Language Models
%A Elangovan, Aparna
%A Liu, Ling
%A Xu, Lei
%A Bodapati, Sravan Babu
%A Roth, Dan
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F elangovan-etal-2024-considers
%X In this position paper, we argue that human evaluation of generative large language models (LLMs) should be a multidisciplinary undertaking that draws upon the insights from disciplines such as user experience research and human behavioral psychology to ensure that the experimental design and results are reliable. The conclusions from these evaluations, therefore, must consider factors such as usability, aesthetics and cognitive biases. We highlight how cognitive biases can conflate fluent information and truthfulness, and how cognitive uncertainty affects the reliability of rating scores such as Likert. Furthermore, the evaluation should differentiate the capabilities and weaknesses of increasingly powerful large language models - which requires effective test sets. Scalability of human evaluation is also crucial to wider adoption. Hence, to design an effective human evaluation system in the age of generative NLP we propose the ConSiDERS-The-Human evaluation framework consisting of 6 pillars - Consistency, Scoring Criteria, Differentiating, User Experience, Responsible, and Scalability.
%R 10.18653/v1/2024.acl-long.63
%U https://aclanthology.org/2024.luhme-long.63/
%U https://doi.org/10.18653/v1/2024.acl-long.63
%P 1137-1160
Markdown (Informal)
[ConSiDERS-The-Human Evaluation Framework: Rethinking Human Evaluation for Generative Large Language Models](https://aclanthology.org/2024.luhme-long.63/) (Elangovan et al., ACL 2024)
ACL
Aparna Elangovan, Ling Liu, Lei Xu, Sravan Babu Bodapati, and Dan Roth. 2024. ConSiDERS-The-Human Evaluation Framework: Rethinking Human Evaluation for Generative Large Language Models. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1137–1160, Bangkok, Thailand. Association for Computational Linguistics.