@inproceedings{popovic-belz-2021-reproduction,
    title = "A Reproduction Study of an Annotation-based Human Evaluation of {MT} Outputs",
    author = "Popovi{\'c}, Maja  and
      Belz, Anya",
    booktitle = "Proceedings of the 14th International Conference on Natural Language Generation",
    month = aug,
    year = "2021",
    address = "Aberdeen, Scotland, UK",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.inlg-1.31",
    pages = "293--300",
    abstract = "In this paper we report our reproduction study of the Croatian part of an annotation-based human evaluation of machine-translated user reviews (Popovic, 2020). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluation in NLG. Our aim was to repeat the original study exactly, except for using a different set of evaluators. We describe the experimental design, characterise differences between original and reproduction study, and present the results from each study, along with analysis of the similarity between them. For the six main evaluation results of Major/Minor/All Comprehension error rates and Major/Minor/All Adequacy error rates, we find that (i) 4/6 system rankings are the same in both studies, (ii) the relative differences between systems are replicated well for Major Comprehension and Adequacy (Pearson{'}s {\textgreater} 0.9), but not for the corresponding Minor error rates (Pearson{'}s 0.36 for Adequacy, 0.67 for Comprehension), and (iii) the individual system scores for both types of Minor error rates had a higher degree of reproducibility than the corresponding Major error rates. We also examine inter-annotator agreement and compare the annotations obtained in the original and reproduction studies.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="popovic-belz-2021-reproduction">
    <titleInfo>
      <title>A Reproduction Study of an Annotation-based Human Evaluation of MT Outputs</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Maja</namePart>
      <namePart type="family">Popović</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anya</namePart>
      <namePart type="family">Belz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 14th International Conference on Natural Language Generation</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Aberdeen, Scotland, UK</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper we report our reproduction study of the Croatian part of an annotation-based human evaluation of machine-translated user reviews (Popovic, 2020). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluation in NLG. Our aim was to repeat the original study exactly, except for using a different set of evaluators. We describe the experimental design, characterise differences between original and reproduction study, and present the results from each study, along with analysis of the similarity between them. For the six main evaluation results of Major/Minor/All Comprehension error rates and Major/Minor/All Adequacy error rates, we find that (i) 4/6 system rankings are the same in both studies, (ii) the relative differences between systems are replicated well for Major Comprehension and Adequacy (Pearson’s &gt; 0.9), but not for the corresponding Minor error rates (Pearson’s 0.36 for Adequacy, 0.67 for Comprehension), and (iii) the individual system scores for both types of Minor error rates had a higher degree of reproducibility than the corresponding Major error rates. We also examine inter-annotator agreement and compare the annotations obtained in the original and reproduction studies.</abstract>
    <identifier type="citekey">popovic-belz-2021-reproduction</identifier>
    <location>
      <url>https://aclanthology.org/2021.inlg-1.31</url>
    </location>
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>293</start>
        <end>300</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Reproduction Study of an Annotation-based Human Evaluation of MT Outputs
%A Popović, Maja
%A Belz, Anya
%S Proceedings of the 14th International Conference on Natural Language Generation
%D 2021
%8 August
%I Association for Computational Linguistics
%C Aberdeen, Scotland, UK
%F popovic-belz-2021-reproduction
%X In this paper we report our reproduction study of the Croatian part of an annotation-based human evaluation of machine-translated user reviews (Popovic, 2020). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluation in NLG. Our aim was to repeat the original study exactly, except for using a different set of evaluators. We describe the experimental design, characterise differences between original and reproduction study, and present the results from each study, along with analysis of the similarity between them. For the six main evaluation results of Major/Minor/All Comprehension error rates and Major/Minor/All Adequacy error rates, we find that (i) 4/6 system rankings are the same in both studies, (ii) the relative differences between systems are replicated well for Major Comprehension and Adequacy (Pearson’s > 0.9), but not for the corresponding Minor error rates (Pearson’s 0.36 for Adequacy, 0.67 for Comprehension), and (iii) the individual system scores for both types of Minor error rates had a higher degree of reproducibility than the corresponding Major error rates. We also examine inter-annotator agreement and compare the annotations obtained in the original and reproduction studies.
%U https://aclanthology.org/2021.inlg-1.31
%P 293-300
Markdown (Informal)
[A Reproduction Study of an Annotation-based Human Evaluation of MT Outputs](https://aclanthology.org/2021.inlg-1.31) (Popović & Belz, INLG 2021)
ACL
Maja Popović and Anya Belz. 2021. A Reproduction Study of an Annotation-based Human Evaluation of MT Outputs. In *Proceedings of the 14th International Conference on Natural Language Generation*, pages 293–300, Aberdeen, Scotland, UK. Association for Computational Linguistics.