@inproceedings{sun-etal-2019-compare,
    title = "How to Compare Summarizers without Target Length? Pitfalls, Solutions and Re-Examination of the Neural Summarization Literature",
    author = "Sun, Simeng  and
      Shapira, Ori  and
      Dagan, Ido  and
      Nenkova, Ani",
    editor = "Bosselut, Antoine  and
      Celikyilmaz, Asli  and
      Ghazvininejad, Marjan  and
      Iyer, Srinivasan  and
      Khandelwal, Urvashi  and
      Rashkin, Hannah  and
      Wolf, Thomas",
    booktitle = "Proceedings of the Workshop on Methods for Optimizing and Evaluating Neural Language Generation",
    month = jun,
    year = "2019",
    address = "Minneapolis, Minnesota",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W19-2303/",
    doi = "10.18653/v1/W19-2303",
    pages = "21--29",
    abstract = "We show that plain ROUGE F1 scores are not ideal for comparing current neural systems which on average produce different lengths. This is due to a non-linear pattern between ROUGE F1 and summary length. To alleviate the effect of length during evaluation, we have proposed a new method which normalizes the ROUGE F1 scores of a system by that of a random system with same average output length. A pilot human evaluation has shown that humans prefer short summaries in terms of the verbosity of a summary but overall consider longer summaries to be of higher quality. While human evaluations are more expensive in time and resources, it is clear that normalization, such as the one we proposed for automatic evaluation, will make human evaluations more meaningful."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sun-etal-2019-compare">
    <titleInfo>
        <title>How to Compare Summarizers without Target Length? Pitfalls, Solutions and Re-Examination of the Neural Summarization Literature</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Simeng</namePart>
        <namePart type="family">Sun</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ori</namePart>
        <namePart type="family">Shapira</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ido</namePart>
        <namePart type="family">Dagan</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Ani</namePart>
        <namePart type="family">Nenkova</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2019-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the Workshop on Methods for Optimizing and Evaluating Neural Language Generation</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Antoine</namePart>
            <namePart type="family">Bosselut</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Asli</namePart>
            <namePart type="family">Celikyilmaz</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Marjan</namePart>
            <namePart type="family">Ghazvininejad</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Srinivasan</namePart>
            <namePart type="family">Iyer</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Urvashi</namePart>
            <namePart type="family">Khandelwal</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Hannah</namePart>
            <namePart type="family">Rashkin</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Thomas</namePart>
            <namePart type="family">Wolf</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Minneapolis, Minnesota</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We show that plain ROUGE F1 scores are not ideal for comparing current neural systems which on average produce different lengths. This is due to a non-linear pattern between ROUGE F1 and summary length. To alleviate the effect of length during evaluation, we have proposed a new method which normalizes the ROUGE F1 scores of a system by that of a random system with same average output length. A pilot human evaluation has shown that humans prefer short summaries in terms of the verbosity of a summary but overall consider longer summaries to be of higher quality. While human evaluations are more expensive in time and resources, it is clear that normalization, such as the one we proposed for automatic evaluation, will make human evaluations more meaningful.</abstract>
    <identifier type="citekey">sun-etal-2019-compare</identifier>
    <identifier type="doi">10.18653/v1/W19-2303</identifier>
    <location>
        <url>https://aclanthology.org/W19-2303/</url>
    </location>
    <part>
        <date>2019-06</date>
        <extent unit="page">
            <start>21</start>
            <end>29</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How to Compare Summarizers without Target Length? Pitfalls, Solutions and Re-Examination of the Neural Summarization Literature
%A Sun, Simeng
%A Shapira, Ori
%A Dagan, Ido
%A Nenkova, Ani
%Y Bosselut, Antoine
%Y Celikyilmaz, Asli
%Y Ghazvininejad, Marjan
%Y Iyer, Srinivasan
%Y Khandelwal, Urvashi
%Y Rashkin, Hannah
%Y Wolf, Thomas
%S Proceedings of the Workshop on Methods for Optimizing and Evaluating Neural Language Generation
%D 2019
%8 June
%I Association for Computational Linguistics
%C Minneapolis, Minnesota
%F sun-etal-2019-compare
%X We show that plain ROUGE F1 scores are not ideal for comparing current neural systems which on average produce different lengths. This is due to a non-linear pattern between ROUGE F1 and summary length. To alleviate the effect of length during evaluation, we have proposed a new method which normalizes the ROUGE F1 scores of a system by that of a random system with same average output length. A pilot human evaluation has shown that humans prefer short summaries in terms of the verbosity of a summary but overall consider longer summaries to be of higher quality. While human evaluations are more expensive in time and resources, it is clear that normalization, such as the one we proposed for automatic evaluation, will make human evaluations more meaningful.
%R 10.18653/v1/W19-2303
%U https://aclanthology.org/W19-2303/
%U https://doi.org/10.18653/v1/W19-2303
%P 21-29
Markdown (Informal)
[How to Compare Summarizers without Target Length? Pitfalls, Solutions and Re-Examination of the Neural Summarization Literature](https://aclanthology.org/W19-2303/) (Sun et al., NAACL 2019)
ACL