@inproceedings{skopek-etal-2023-towards,
title = "Towards Better Evaluation of Instruction-Following: A Case-Study in Summarization",
author = "Skopek, Ondrej and
Aralikatte, Rahul and
Gooding, Sian and
Carbune, Victor",
editor = "Jiang, Jing and
Reitter, David and
Deng, Shumin",
booktitle = "Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.conll-1.16",
doi = "10.18653/v1/2023.conll-1.16",
pages = "221--237",
abstract = "Despite recent advances, evaluating how well large language models (LLMs) follow user instructions remains an open problem. While evaluation methods of language models have seen a rise in prompt-based approaches, limited work on the correctness of these methods has been conducted. In this work, we perform a meta-evaluation of a variety of metrics to quantify how accurately they measure the instruction-following abilities of LLMs. Our investigation is performed on grounded query-based summarization by collecting a new short-form, real-world dataset riSum, containing 300 document-instruction pairs with 3 answers each. All 900 answers are rated by 3 human annotators. Using riSum, we analyze the agreement between evaluation methods and human judgment. Finally, we propose new LLM-based reference-free evaluation methods that improve upon established baselines and perform on par with costly reference-based metrics that require high-quality summaries.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="skopek-etal-2023-towards">
<titleInfo>
<title>Towards Better Evaluation of Instruction-Following: A Case-Study in Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ondrej</namePart>
<namePart type="family">Skopek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Aralikatte</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sian</namePart>
<namePart type="family">Gooding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Carbune</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Reitter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shumin</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite recent advances, evaluating how well large language models (LLMs) follow user instructions remains an open problem. While evaluation methods of language models have seen a rise in prompt-based approaches, limited work on the correctness of these methods has been conducted. In this work, we perform a meta-evaluation of a variety of metrics to quantify how accurately they measure the instruction-following abilities of LLMs. Our investigation is performed on grounded query-based summarization by collecting a new short-form, real-world dataset riSum, containing 300 document-instruction pairs with 3 answers each. All 900 answers are rated by 3 human annotators. Using riSum, we analyze the agreement between evaluation methods and human judgment. Finally, we propose new LLM-based reference-free evaluation methods that improve upon established baselines and perform on par with costly reference-based metrics that require high-quality summaries.</abstract>
<identifier type="citekey">skopek-etal-2023-towards</identifier>
<identifier type="doi">10.18653/v1/2023.conll-1.16</identifier>
<location>
<url>https://aclanthology.org/2023.conll-1.16</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>221</start>
<end>237</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Better Evaluation of Instruction-Following: A Case-Study in Summarization
%A Skopek, Ondrej
%A Aralikatte, Rahul
%A Gooding, Sian
%A Carbune, Victor
%Y Jiang, Jing
%Y Reitter, David
%Y Deng, Shumin
%S Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F skopek-etal-2023-towards
%X Despite recent advances, evaluating how well large language models (LLMs) follow user instructions remains an open problem. While evaluation methods of language models have seen a rise in prompt-based approaches, limited work on the correctness of these methods has been conducted. In this work, we perform a meta-evaluation of a variety of metrics to quantify how accurately they measure the instruction-following abilities of LLMs. Our investigation is performed on grounded query-based summarization by collecting a new short-form, real-world dataset riSum, containing 300 document-instruction pairs with 3 answers each. All 900 answers are rated by 3 human annotators. Using riSum, we analyze the agreement between evaluation methods and human judgment. Finally, we propose new LLM-based reference-free evaluation methods that improve upon established baselines and perform on par with costly reference-based metrics that require high-quality summaries.
%R 10.18653/v1/2023.conll-1.16
%U https://aclanthology.org/2023.conll-1.16
%U https://doi.org/10.18653/v1/2023.conll-1.16
%P 221-237
Markdown (Informal)
[Towards Better Evaluation of Instruction-Following: A Case-Study in Summarization](https://aclanthology.org/2023.conll-1.16) (Skopek et al., CoNLL 2023)
ACL
Ondrej Skopek, Rahul Aralikatte, Sian Gooding, and Victor Carbune. 2023. Towards Better Evaluation of Instruction-Following: A Case-Study in Summarization. In Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL), pages 221–237, Singapore. Association for Computational Linguistics.