BibTeX
@inproceedings{deutsch-roth-2021-understanding,
title = "Understanding the Extent to which Content Quality Metrics Measure the Information Quality of Summaries",
author = "Deutsch, Daniel and
Roth, Dan",
editor = "Bisazza, Arianna and
Abend, Omri",
booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.conll-1.24",
doi = "10.18653/v1/2021.conll-1.24",
pages = "300--309",
abstract = "Reference-based metrics such as ROUGE or BERTScore evaluate the content quality of a summary by comparing the summary to a reference. Ideally, this comparison should measure the summary{'}s information quality by calculating how much information the summaries have in common. In this work, we analyze the token alignments used by ROUGE and BERTScore to compare summaries and argue that their scores largely cannot be interpreted as measuring information overlap. Rather, they are better estimates of the extent to which the summaries discuss the same topics. Further, we provide evidence that this result holds true for many other summarization evaluation metrics. The consequence of this result is that the most frequently used summarization evaluation metrics do not align with the community{'}s research goal, to generate summaries with high-quality information. However, we conclude by demonstrating that a recently proposed metric, QAEval, which scores summaries using question-answering, appears to better capture information quality than current evaluations, highlighting a direction for future research.",
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="deutsch-roth-2021-understanding">
    <titleInfo>
      <title>Understanding the Extent to which Content Quality Metrics Measure the Information Quality of Summaries</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Daniel</namePart>
      <namePart type="family">Deutsch</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Roth</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Arianna</namePart>
        <namePart type="family">Bisazza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Omri</namePart>
        <namePart type="family">Abend</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Reference-based metrics such as ROUGE or BERTScore evaluate the content quality of a summary by comparing the summary to a reference. Ideally, this comparison should measure the summary’s information quality by calculating how much information the summaries have in common. In this work, we analyze the token alignments used by ROUGE and BERTScore to compare summaries and argue that their scores largely cannot be interpreted as measuring information overlap. Rather, they are better estimates of the extent to which the summaries discuss the same topics. Further, we provide evidence that this result holds true for many other summarization evaluation metrics. The consequence of this result is that the most frequently used summarization evaluation metrics do not align with the community’s research goal, to generate summaries with high-quality information. However, we conclude by demonstrating that a recently proposed metric, QAEval, which scores summaries using question-answering, appears to better capture information quality than current evaluations, highlighting a direction for future research.</abstract>
    <identifier type="citekey">deutsch-roth-2021-understanding</identifier>
    <identifier type="doi">10.18653/v1/2021.conll-1.24</identifier>
    <location>
      <url>https://aclanthology.org/2021.conll-1.24</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>300</start>
        <end>309</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T Understanding the Extent to which Content Quality Metrics Measure the Information Quality of Summaries
%A Deutsch, Daniel
%A Roth, Dan
%Y Bisazza, Arianna
%Y Abend, Omri
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F deutsch-roth-2021-understanding
%X Reference-based metrics such as ROUGE or BERTScore evaluate the content quality of a summary by comparing the summary to a reference. Ideally, this comparison should measure the summary’s information quality by calculating how much information the summaries have in common. In this work, we analyze the token alignments used by ROUGE and BERTScore to compare summaries and argue that their scores largely cannot be interpreted as measuring information overlap. Rather, they are better estimates of the extent to which the summaries discuss the same topics. Further, we provide evidence that this result holds true for many other summarization evaluation metrics. The consequence of this result is that the most frequently used summarization evaluation metrics do not align with the community’s research goal, to generate summaries with high-quality information. However, we conclude by demonstrating that a recently proposed metric, QAEval, which scores summaries using question-answering, appears to better capture information quality than current evaluations, highlighting a direction for future research.
%R 10.18653/v1/2021.conll-1.24
%U https://aclanthology.org/2021.conll-1.24
%U https://doi.org/10.18653/v1/2021.conll-1.24
%P 300-309

Markdown (Informal)
[Understanding the Extent to which Content Quality Metrics Measure the Information Quality of Summaries](https://aclanthology.org/2021.conll-1.24) (Deutsch & Roth, CoNLL 2021)
ACL
Daniel Deutsch and Dan Roth. 2021. Understanding the Extent to which Content Quality Metrics Measure the Information Quality of Summaries. In Proceedings of the 25th Conference on Computational Natural Language Learning, pages 300–309, Online. Association for Computational Linguistics.