% Mille et al., INLG 2021: reproduction study of the PASS human evaluation (ReproGen, Track A).
@inproceedings{mille-etal-2021-another,
  author    = {Mille, Simon and
               Castro Ferreira, Thiago and
               Belz, Anya and
               Davis, Brian},
  title     = {Another {PASS}: A Reproduction Study of the Human Evaluation of a Football Report Generation System},
  editor    = {Belz, Anya and
               Fan, Angela and
               Reiter, Ehud and
               Sripada, Yaji},
  booktitle = {Proceedings of the 14th International Conference on Natural Language Generation},
  month     = aug,
  year      = {2021},
  address   = {Aberdeen, Scotland, UK},
  publisher = {Association for Computational Linguistics},
  pages     = {286--292},
  doi       = {10.18653/v1/2021.inlg-1.30},
  url       = {https://aclanthology.org/2021.inlg-1.30},
  abstract  = {This paper reports results from a reproduction study in which we repeated the human evaluation of the PASS Dutch-language football report generation system (van der Lee et al., 2017). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluations in NLG, in Track A (Paper 1). We aimed to repeat the original study exactly, with the main difference that a different set of evaluators was used. We describe the study design, present the results from the original and the reproduction study, and then compare and analyse the differences between the two sets of results. For the two {`}headline{'} results of average Fluency and Clarity, we find that in both studies, the system was rated more highly for Clarity than for Fluency, and Clarity had higher standard deviation. Clarity and Fluency ratings were higher, and their standard deviations lower, in the reproduction study than in the original study by substantial margins. Clarity had a higher degree of reproducibility than Fluency, as measured by the coefficient of variation. Data and code are publicly available.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID doubles as the citation key given in the citekey identifier below. -->
  <mods ID="mille-etal-2021-another">
    <titleInfo>
      <title>Another PASS: A Reproduction Study of the Human Evaluation of a Football Report Generation System</title>
    </titleInfo>
    <!-- Paper authors -->
    <name type="personal">
      <namePart type="given">Simon</namePart>
      <namePart type="family">Mille</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Thiago</namePart>
      <namePart type="family">Castro Ferreira</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anya</namePart>
      <namePart type="family">Belz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Brian</namePart>
      <namePart type="family">Davis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 14th International Conference on Natural Language Generation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anya</namePart>
        <namePart type="family">Belz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Angela</namePart>
        <namePart type="family">Fan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ehud</namePart>
        <namePart type="family">Reiter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yaji</namePart>
        <namePart type="family">Sripada</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Aberdeen, Scotland, UK</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper reports results from a reproduction study in which we repeated the human evaluation of the PASS Dutch-language football report generation system (van der Lee et al., 2017). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluations in NLG, in Track A (Paper 1). We aimed to repeat the original study exactly, with the main difference that a different set of evaluators was used. We describe the study design, present the results from the original and the reproduction study, and then compare and analyse the differences between the two sets of results. For the two ‘headline’ results of average Fluency and Clarity, we find that in both studies, the system was rated more highly for Clarity than for Fluency, and Clarity had higher standard deviation. Clarity and Fluency ratings were higher, and their standard deviations lower, in the reproduction study than in the original study by substantial margins. Clarity had a higher degree of reproducibility than Fluency, as measured by the coefficient of variation. Data and code are publicly available.</abstract>
    <!-- Identifiers and landing page -->
    <identifier type="citekey">mille-etal-2021-another</identifier>
    <identifier type="doi">10.18653/v1/2021.inlg-1.30</identifier>
    <location>
      <url>https://aclanthology.org/2021.inlg-1.30</url>
    </location>
    <!-- Date and page extent of this part -->
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>286</start>
        <end>292</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%A Mille, Simon
%A Castro Ferreira, Thiago
%A Belz, Anya
%A Davis, Brian
%T Another PASS: A Reproduction Study of the Human Evaluation of a Football Report Generation System
%S Proceedings of the 14th International Conference on Natural Language Generation
%Y Belz, Anya
%Y Fan, Angela
%Y Reiter, Ehud
%Y Sripada, Yaji
%I Association for Computational Linguistics
%C Aberdeen, Scotland, UK
%D 2021
%8 August
%F mille-etal-2021-another
%R 10.18653/v1/2021.inlg-1.30
%U https://aclanthology.org/2021.inlg-1.30
%U https://doi.org/10.18653/v1/2021.inlg-1.30
%X This paper reports results from a reproduction study in which we repeated the human evaluation of the PASS Dutch-language football report generation system (van der Lee et al., 2017). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluations in NLG, in Track A (Paper 1). We aimed to repeat the original study exactly, with the main difference that a different set of evaluators was used. We describe the study design, present the results from the original and the reproduction study, and then compare and analyse the differences between the two sets of results. For the two ‘headline’ results of average Fluency and Clarity, we find that in both studies, the system was rated more highly for Clarity than for Fluency, and Clarity had higher standard deviation. Clarity and Fluency ratings were higher, and their standard deviations lower, in the reproduction study than in the original study by substantial margins. Clarity had a higher degree of reproducibility than Fluency, as measured by the coefficient of variation. Data and code are publicly available.
%P 286-292
Markdown (Informal)
[Another PASS: A Reproduction Study of the Human Evaluation of a Football Report Generation System](https://aclanthology.org/2021.inlg-1.30) (Mille et al., INLG 2021)
ACL