@inproceedings{tay-etal-2019-red,
title = "Red-faced {ROUGE}: Examining the Suitability of {ROUGE} for Opinion Summary Evaluation",
author = "Tay, Wenyi and
Joshi, Aditya and
Zhang, Xiuzhen and
Karimi, Sarvnaz and
Wan, Stephen",
editor = "Mistica, Meladel and
Piccardi, Massimo and
MacKinlay, Andrew",
booktitle = "Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association",
month = "4--6 " # dec,
year = "2019",
address = "Sydney, Australia",
publisher = "Australasian Language Technology Association",
url = "https://aclanthology.org/U19-1008",
pages = "52--60",
abstract = "One of the most common metrics to automatically evaluate opinion summaries is ROUGE, a metric developed for text summarisation. ROUGE counts the overlap of word or word units between a candidate summary against reference summaries. This formulation treats all words in the reference summary equally. In opinion summaries, however, not all words in the reference are equally important. Opinion summarisation requires to correctly pair two types of semantic information: (1) aspect or opinion target; and (2) polarity of candidate and reference summaries. We investigate the suitability of ROUGE for evaluating opin-ion summaries of online reviews. Using three simulation-based experiments, we evaluate the behaviour of ROUGE for opinion summarisation on the ability to match aspect and polarity. We show that ROUGE cannot distinguish opinion summaries of similar or opposite polarities for the same aspect. Moreover,ROUGE scores have significant variance under different configuration settings. As a result, we present three recommendations for future work that uses ROUGE to evaluate opinion summarisation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tay-etal-2019-red">
<titleInfo>
<title>Red-faced ROUGE: Examining the Suitability of ROUGE for Opinion Summary Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenyi</namePart>
<namePart type="family">Tay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Joshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiuzhen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarvnaz</namePart>
<namePart type="family">Karimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stephen</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association</title>
</titleInfo>
<name type="personal">
<namePart type="given">Meladel</namePart>
<namePart type="family">Mistica</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Massimo</namePart>
<namePart type="family">Piccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">MacKinlay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Australasian Language Technology Association</publisher>
<place>
<placeTerm type="text">Sydney, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>One of the most common metrics for automatically evaluating opinion summaries is ROUGE, a metric developed for text summarisation. ROUGE counts the overlap of words or word units between a candidate summary and reference summaries. This formulation treats all words in the reference summary equally. In opinion summaries, however, not all words in the reference are equally important. Opinion summarisation requires correctly pairing two types of semantic information: (1) the aspect or opinion target; and (2) the polarity of candidate and reference summaries. We investigate the suitability of ROUGE for evaluating opinion summaries of online reviews. Using three simulation-based experiments, we evaluate the behaviour of ROUGE for opinion summarisation in terms of its ability to match aspect and polarity. We show that ROUGE cannot distinguish opinion summaries of similar or opposite polarities for the same aspect. Moreover, ROUGE scores have significant variance under different configuration settings. As a result, we present three recommendations for future work that uses ROUGE to evaluate opinion summarisation.</abstract>
<identifier type="citekey">tay-etal-2019-red</identifier>
<location>
<url>https://aclanthology.org/U19-1008</url>
</location>
<part>
<date>2019-12</date>
<extent unit="page">
<start>52</start>
<end>60</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Red-faced ROUGE: Examining the Suitability of ROUGE for Opinion Summary Evaluation
%A Tay, Wenyi
%A Joshi, Aditya
%A Zhang, Xiuzhen
%A Karimi, Sarvnaz
%A Wan, Stephen
%Y Mistica, Meladel
%Y Piccardi, Massimo
%Y MacKinlay, Andrew
%S Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association
%D 2019
%8 4–6 December
%I Australasian Language Technology Association
%C Sydney, Australia
%F tay-etal-2019-red
%X One of the most common metrics for automatically evaluating opinion summaries is ROUGE, a metric developed for text summarisation. ROUGE counts the overlap of words or word units between a candidate summary and reference summaries. This formulation treats all words in the reference summary equally. In opinion summaries, however, not all words in the reference are equally important. Opinion summarisation requires correctly pairing two types of semantic information: (1) the aspect or opinion target; and (2) the polarity of candidate and reference summaries. We investigate the suitability of ROUGE for evaluating opinion summaries of online reviews. Using three simulation-based experiments, we evaluate the behaviour of ROUGE for opinion summarisation in terms of its ability to match aspect and polarity. We show that ROUGE cannot distinguish opinion summaries of similar or opposite polarities for the same aspect. Moreover, ROUGE scores have significant variance under different configuration settings. As a result, we present three recommendations for future work that uses ROUGE to evaluate opinion summarisation.
%U https://aclanthology.org/U19-1008
%P 52-60
Markdown (Informal)
[Red-faced ROUGE: Examining the Suitability of ROUGE for Opinion Summary Evaluation](https://aclanthology.org/U19-1008) (Tay et al., ALTA 2019)
ACL
Wenyi Tay, Aditya Joshi, Xiuzhen Zhang, Sarvnaz Karimi, and Stephen Wan. 2019. [Red-faced ROUGE: Examining the Suitability of ROUGE for Opinion Summary Evaluation](https://aclanthology.org/U19-1008). In *Proceedings of the 17th Annual Workshop of the Australasian Language Technology Association*, pages 52–60, Sydney, Australia. Australasian Language Technology Association.
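
To make the abstract's central claim concrete, the sketch below shows ROUGE-1 recall (unigram overlap) in Python. This is a minimal illustration, not the authors' implementation, and the review sentences are invented. Because swapping a polarity word ("great" to "terrible") changes only one token, the same-polarity and opposite-polarity candidates receive nearly identical scores, which is exactly the failure mode the paper reports.

```python
# Minimal sketch of ROUGE-1 recall (unigram overlap), not the paper's code.
# Example sentences are hypothetical, chosen to show that flipping polarity
# changes only one token and therefore barely moves the score.
from collections import Counter

def rouge1_recall(candidate: str, reference: str) -> float:
    """Count unigram overlap of the candidate against the reference,
    normalised by reference length (ROUGE-1 recall)."""
    cand = Counter(candidate.lower().split())
    ref = Counter(reference.lower().split())
    overlap = sum(min(cand[w], ref[w]) for w in ref)
    return overlap / max(sum(ref.values()), 1)

reference = "the battery life is great and charging is fast"
same_polarity = "the battery life is great"
opposite_polarity = "the battery life is terrible"

print(rouge1_recall(same_polarity, reference))      # 5/9 ~ 0.556
print(rouge1_recall(opposite_polarity, reference))  # 4/9 ~ 0.444
```

Since every reference token is weighted equally, the one-token gap between the two candidates is all that separates them, even though they express opposite opinions about the same aspect.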