@inproceedings{demeter-etal-2023-multi,
title = "Multi-domain Summarization from Leaderboards to Practice: Re-examining Automatic and Human Evaluation",
author = "Demeter, David and
Agarwal, Oshin and
Ben Igeri, Simon and
Sterbentz, Marko and
Molino, Neil and
Conroy, John and
Nenkova, Ani",
editor = "Gehrmann, Sebastian and
Wang, Alex and
Sedoc, Jo{\~a}o and
Clark, Elizabeth and
Dhole, Kaustubh and
Chandu, Khyathi Raghavi and
Santus, Enrico and
Sedghamiz, Hooman",
booktitle = "Proceedings of the Third Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.gem-1.20",
pages = "233--242",
abstract = "Existing literature does not give much guidance on how to build the best possible multi-domain summarization model from existing components. We present an extensive evaluation of popular pre-trained models on a wide range of datasets to inform the selection of both the model and the training data for robust summarization across several domains. We find that fine-tuned BART performs better than T5 and PEGASUS, both on in-domain and out-of-domain data, regardless of the dataset used for fine-tuning. While BART has the best performance, it does vary considerably across domains. A multi-domain summarizer that works well for all domains can be built by simply fine-tuning on diverse domains. It even performs better than an in-domain summarizer, even when using fewer total training examples. While the success of such a multi-domain summarization model is clear through automatic evaluation, by conducting a human evaluation, we find that there are variations that can not be captured by any of the automatic evaluation metrics and thus not reflected in standard leaderboards. Furthermore, we find that conducting reliable human evaluation can be complex as well. Even experienced summarization researchers can be inconsistent with one another in their assessment of the quality of a summary, and also with themselves when re-annotating the same summary. The findings of our study are two-fold. First, BART fine-tuned on heterogeneous domains is a great multi-domain summarizer for practical purposes. At the same time, we need to re-examine not just automatic evaluation metrics but also human evaluation methods to responsibly measure progress in summarization.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="demeter-etal-2023-multi">
<titleInfo>
<title>Multi-domain Summarization from Leaderboards to Practice: Re-examining Automatic and Human Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Demeter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oshin</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Ben Igeri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Sterbentz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Neil</namePart>
<namePart type="family">Molino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Conroy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ani</namePart>
<namePart type="family">Nenkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Clark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="given">Raghavi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hooman</namePart>
<namePart type="family">Sedghamiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Existing literature does not give much guidance on how to build the best possible multi-domain summarization model from existing components. We present an extensive evaluation of popular pre-trained models on a wide range of datasets to inform the selection of both the model and the training data for robust summarization across several domains. We find that fine-tuned BART performs better than T5 and PEGASUS, both on in-domain and out-of-domain data, regardless of the dataset used for fine-tuning. While BART has the best performance, it does vary considerably across domains. A multi-domain summarizer that works well for all domains can be built by simply fine-tuning on diverse domains. It even performs better than an in-domain summarizer, even when using fewer total training examples. While the success of such a multi-domain summarization model is clear through automatic evaluation, by conducting a human evaluation, we find that there are variations that can not be captured by any of the automatic evaluation metrics and thus not reflected in standard leaderboards. Furthermore, we find that conducting reliable human evaluation can be complex as well. Even experienced summarization researchers can be inconsistent with one another in their assessment of the quality of a summary, and also with themselves when re-annotating the same summary. The findings of our study are two-fold. First, BART fine-tuned on heterogeneous domains is a great multi-domain summarizer for practical purposes. At the same time, we need to re-examine not just automatic evaluation metrics but also human evaluation methods to responsibly measure progress in summarization.</abstract>
<identifier type="citekey">demeter-etal-2023-multi</identifier>
<location>
<url>https://aclanthology.org/2023.gem-1.20</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>233</start>
<end>242</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-domain Summarization from Leaderboards to Practice: Re-examining Automatic and Human Evaluation
%A Demeter, David
%A Agarwal, Oshin
%A Ben Igeri, Simon
%A Sterbentz, Marko
%A Molino, Neil
%A Conroy, John
%A Nenkova, Ani
%Y Gehrmann, Sebastian
%Y Wang, Alex
%Y Sedoc, João
%Y Clark, Elizabeth
%Y Dhole, Kaustubh
%Y Chandu, Khyathi Raghavi
%Y Santus, Enrico
%Y Sedghamiz, Hooman
%S Proceedings of the Third Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F demeter-etal-2023-multi
%X Existing literature does not give much guidance on how to build the best possible multi-domain summarization model from existing components. We present an extensive evaluation of popular pre-trained models on a wide range of datasets to inform the selection of both the model and the training data for robust summarization across several domains. We find that fine-tuned BART performs better than T5 and PEGASUS, both on in-domain and out-of-domain data, regardless of the dataset used for fine-tuning. While BART has the best performance, it does vary considerably across domains. A multi-domain summarizer that works well for all domains can be built by simply fine-tuning on diverse domains. It even performs better than an in-domain summarizer, even when using fewer total training examples. While the success of such a multi-domain summarization model is clear through automatic evaluation, by conducting a human evaluation, we find that there are variations that can not be captured by any of the automatic evaluation metrics and thus not reflected in standard leaderboards. Furthermore, we find that conducting reliable human evaluation can be complex as well. Even experienced summarization researchers can be inconsistent with one another in their assessment of the quality of a summary, and also with themselves when re-annotating the same summary. The findings of our study are two-fold. First, BART fine-tuned on heterogeneous domains is a great multi-domain summarizer for practical purposes. At the same time, we need to re-examine not just automatic evaluation metrics but also human evaluation methods to responsibly measure progress in summarization.
%U https://aclanthology.org/2023.gem-1.20
%P 233-242
Markdown (Informal)
[Multi-domain Summarization from Leaderboards to Practice: Re-examining Automatic and Human Evaluation](https://aclanthology.org/2023.gem-1.20) (Demeter et al., GEM-WS 2023)
ACL
- David Demeter, Oshin Agarwal, Simon Ben Igeri, Marko Sterbentz, Neil Molino, John Conroy, and Ani Nenkova. 2023. Multi-domain Summarization from Leaderboards to Practice: Re-examining Automatic and Human Evaluation. In Proceedings of the Third Workshop on Natural Language Generation, Evaluation, and Metrics (GEM), pages 233–242, Singapore. Association for Computational Linguistics.