@inproceedings{forde-etal-2024-evaluating,
title = "Re-Evaluating Evaluation for Multilingual Summarization",
author = "Forde, Jessica and
Zhang, Ruochen and
Sutawika, Lintang and
Aji, Alham and
Cahyawijaya, Samuel and
Winata, Genta and
Wu, Minghao and
Eickhoff, Carsten and
Biderman, Stella and
Pavlick, Ellie",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1085",
pages = "19476--19493",
abstract = "Automatic evaluation approaches (ROUGE, BERTScore, LLM-based evaluators) have been widely used to evaluate summarization tasks. Despite the complexities of script differences and tokenization, these approaches have been indiscriminately applied to summarization across multiple languages. While previous works have argued that these approaches correlate strongly with human ratings in English, it remains unclear whether the conclusion holds for other languages. To answer this question, we construct a small-scale pilot dataset containing article-summary pairs and human ratings in English, Chinese and Indonesian. To measure the strength of summaries, our ratings are measured as head-to-head comparisons with resulting Elo scores across four dimensions. Our analysis reveals that standard metrics are unreliable measures of quality, and that these problems are exacerbated in Chinese and Indonesian. We advocate for more nuanced and careful considerations in designing a robust evaluation framework for multiple languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="forde-etal-2024-evaluating">
<titleInfo>
<title>Re-Evaluating Evaluation for Multilingual Summarization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jessica</namePart>
<namePart type="family">Forde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruochen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lintang</namePart>
<namePart type="family">Sutawika</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Cahyawijaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Genta</namePart>
<namePart type="family">Winata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minghao</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carsten</namePart>
<namePart type="family">Eickhoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stella</namePart>
<namePart type="family">Biderman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ellie</namePart>
<namePart type="family">Pavlick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic evaluation approaches (ROUGE, BERTScore, LLM-based evaluators) have been widely used to evaluate summarization tasks. Despite the complexities of script differences and tokenization, these approaches have been indiscriminately applied to summarization across multiple languages. While previous works have argued that these approaches correlate strongly with human ratings in English, it remains unclear whether the conclusion holds for other languages. To answer this question, we construct a small-scale pilot dataset containing article-summary pairs and human ratings in English, Chinese and Indonesian. To measure the strength of summaries, our ratings are measured as head-to-head comparisons with resulting Elo scores across four dimensions. Our analysis reveals that standard metrics are unreliable measures of quality, and that these problems are exacerbated in Chinese and Indonesian. We advocate for more nuanced and careful considerations in designing a robust evaluation framework for multiple languages.</abstract>
<identifier type="citekey">forde-etal-2024-evaluating</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1085</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>19476</start>
<end>19493</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Re-Evaluating Evaluation for Multilingual Summarization
%A Forde, Jessica
%A Zhang, Ruochen
%A Sutawika, Lintang
%A Aji, Alham
%A Cahyawijaya, Samuel
%A Winata, Genta
%A Wu, Minghao
%A Eickhoff, Carsten
%A Biderman, Stella
%A Pavlick, Ellie
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F forde-etal-2024-evaluating
%X Automatic evaluation approaches (ROUGE, BERTScore, LLM-based evaluators) have been widely used to evaluate summarization tasks. Despite the complexities of script differences and tokenization, these approaches have been indiscriminately applied to summarization across multiple languages. While previous works have argued that these approaches correlate strongly with human ratings in English, it remains unclear whether the conclusion holds for other languages. To answer this question, we construct a small-scale pilot dataset containing article-summary pairs and human ratings in English, Chinese and Indonesian. To measure the strength of summaries, our ratings are measured as head-to-head comparisons with resulting Elo scores across four dimensions. Our analysis reveals that standard metrics are unreliable measures of quality, and that these problems are exacerbated in Chinese and Indonesian. We advocate for more nuanced and careful considerations in designing a robust evaluation framework for multiple languages.
%U https://aclanthology.org/2024.emnlp-main.1085
%P 19476-19493
Markdown (Informal)
[Re-Evaluating Evaluation for Multilingual Summarization](https://aclanthology.org/2024.emnlp-main.1085) (Forde et al., EMNLP 2024)
ACL
- Jessica Forde, Ruochen Zhang, Lintang Sutawika, Alham Aji, Samuel Cahyawijaya, Genta Winata, Minghao Wu, Carsten Eickhoff, Stella Biderman, and Ellie Pavlick. 2024. Re-Evaluating Evaluation for Multilingual Summarization. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 19476–19493, Miami, Florida, USA. Association for Computational Linguistics.