@inproceedings{pradhan-todi-2023-understanding,
title = "Understanding Large Language Model Based Metrics for Text Summarization",
author = "Pradhan, Abhishek and
Todi, Ketan",
editor = {Deutsch, Daniel and
Dror, Rotem and
Eger, Steffen and
Gao, Yang and
Leiter, Christoph and
Opitz, Juri and
R{\"u}ckl{\'e}, Andreas},
booktitle = "Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems",
month = nov,
year = "2023",
address = "Bali, Indonesia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.eval4nlp-1.12",
doi = "10.18653/v1/2023.eval4nlp-1.12",
pages = "149--155",
abstract = "This paper compares the two most widely used techniques for evaluating generative tasks with large language models (LLMs): prompt-based evaluation and log-likelihood evaluation as part of the Eval4NLP shared task. We focus on the summarization task and evaluate both small and large LLM models. We also study the impact of LLAMA and LLAMA 2 on summarization, using the same set of prompts and techniques. We used the Eval4NLP dataset for our comparison. This study provides evidence of the advantages of prompt-based evaluation techniques over log-likelihood based techniques, especially for large models and models with better reasoning power.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pradhan-todi-2023-understanding">
<titleInfo>
<title>Understanding Large Language Model Based Metrics for Text Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abhishek</namePart>
<namePart type="family">Pradhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ketan</namePart>
<namePart type="family">Todi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Deutsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steffen</namePart>
<namePart type="family">Eger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christoph</namePart>
<namePart type="family">Leiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juri</namePart>
<namePart type="family">Opitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Rücklé</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bali, Indonesia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper compares the two most widely used techniques for evaluating generative tasks with large language models (LLMs): prompt-based evaluation and log-likelihood evaluation as part of the Eval4NLP shared task. We focus on the summarization task and evaluate both small and large LLM models. We also study the impact of LLAMA and LLAMA 2 on summarization, using the same set of prompts and techniques. We used the Eval4NLP dataset for our comparison. This study provides evidence of the advantages of prompt-based evaluation techniques over log-likelihood based techniques, especially for large models and models with better reasoning power.</abstract>
<identifier type="citekey">pradhan-todi-2023-understanding</identifier>
<identifier type="doi">10.18653/v1/2023.eval4nlp-1.12</identifier>
<location>
<url>https://aclanthology.org/2023.eval4nlp-1.12</url>
</location>
<part>
<date>2023-11</date>
<extent unit="page">
<start>149</start>
<end>155</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Understanding Large Language Model Based Metrics for Text Summarization
%A Pradhan, Abhishek
%A Todi, Ketan
%Y Deutsch, Daniel
%Y Dror, Rotem
%Y Eger, Steffen
%Y Gao, Yang
%Y Leiter, Christoph
%Y Opitz, Juri
%Y Rücklé, Andreas
%S Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems
%D 2023
%8 November
%I Association for Computational Linguistics
%C Bali, Indonesia
%F pradhan-todi-2023-understanding
%X This paper compares the two most widely used techniques for evaluating generative tasks with large language models (LLMs): prompt-based evaluation and log-likelihood evaluation as part of the Eval4NLP shared task. We focus on the summarization task and evaluate both small and large LLM models. We also study the impact of LLAMA and LLAMA 2 on summarization, using the same set of prompts and techniques. We used the Eval4NLP dataset for our comparison. This study provides evidence of the advantages of prompt-based evaluation techniques over log-likelihood based techniques, especially for large models and models with better reasoning power.
%R 10.18653/v1/2023.eval4nlp-1.12
%U https://aclanthology.org/2023.eval4nlp-1.12
%U https://doi.org/10.18653/v1/2023.eval4nlp-1.12
%P 149-155
Markdown (Informal)
[Understanding Large Language Model Based Metrics for Text Summarization](https://aclanthology.org/2023.eval4nlp-1.12) (Pradhan & Todi, Eval4NLP-WS 2023)
ACL