% ReproGen 2022 shared-task paper (ACL Anthology ID 2022.inlg-genchal.13).
@inproceedings{braggaar-etal-2022-reproduction,
    title = "A reproduction study of methods for evaluating dialogue system output: Replicating {Santhanam} and {Shaikh} (2019)",
    author = "Braggaar, Anouck and
      Tomas, Fr{\'e}d{\'e}ric and
      Blomsma, Peter and
      Hommes, Saar and
      Braun, Nadine and
      van Miltenburg, Emiel and
      van der Lee, Chris and
      Goudbeek, Martijn and
      Krahmer, Emiel",
    booktitle = "Proceedings of the 15th International Conference on Natural Language Generation: Generation Challenges",
    month = jul,
    year = "2022",
    address = "Waterville, Maine, USA and virtual meeting",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.inlg-genchal.13",
    pages = "86--93",
    abstract = "In this paper, we describe our reproduction effort of the paper: Towards Best Experiment Design for Evaluating Dialogue System Output by Santhanam and Shaikh (2019) for the 2022 ReproGen shared task. We aim to produce the same results, using different human evaluators, and a different implementation of the automatic metrics used in the original paper. Although overall the study posed some challenges to reproduce (e.g. difficulties with reproduction of automatic metrics and statistics), in the end we did find that the results generally replicate the findings of Santhanam and Shaikh (2019) and seem to follow similar trends.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="braggaar-etal-2022-reproduction">
<titleInfo>
<title>A reproduction study of methods for evaluating dialogue system output: Replicating Santhanam and Shaikh (2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anouck</namePart>
<namePart type="family">Braggaar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Tomas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Blomsma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saar</namePart>
<namePart type="family">Hommes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadine</namePart>
<namePart type="family">Braun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emiel</namePart>
<namePart type="family">van Miltenburg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">van der Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martijn</namePart>
<namePart type="family">Goudbeek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emiel</namePart>
<namePart type="family">Krahmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Natural Language Generation: Generation Challenges</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Waterville, Maine, USA and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we describe our reproduction effort of the paper: Towards Best Experiment Design for Evaluating Dialogue System Output by Santhanam and Shaikh (2019) for the 2022 ReproGen shared task. We aim to produce the same results, using different human evaluators, and a different implementation of the automatic metrics used in the original paper. Although overall the study posed some challenges to reproduce (e.g. difficulties with reproduction of automatic metrics and statistics), in the end we did find that the results generally replicate the findings of Santhanam and Shaikh (2019) and seem to follow similar trends.</abstract>
<identifier type="citekey">braggaar-etal-2022-reproduction</identifier>
<location>
<url>https://aclanthology.org/2022.inlg-genchal.13</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>86</start>
<end>93</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A reproduction study of methods for evaluating dialogue system output: Replicating Santhanam and Shaikh (2019)
%A Braggaar, Anouck
%A Tomas, Frédéric
%A Blomsma, Peter
%A Hommes, Saar
%A Braun, Nadine
%A van Miltenburg, Emiel
%A van der Lee, Chris
%A Goudbeek, Martijn
%A Krahmer, Emiel
%S Proceedings of the 15th International Conference on Natural Language Generation: Generation Challenges
%D 2022
%8 July
%I Association for Computational Linguistics
%C Waterville, Maine, USA and virtual meeting
%F braggaar-etal-2022-reproduction
%X In this paper, we describe our reproduction effort of the paper: Towards Best Experiment Design for Evaluating Dialogue System Output by Santhanam and Shaikh (2019) for the 2022 ReproGen shared task. We aim to produce the same results, using different human evaluators, and a different implementation of the automatic metrics used in the original paper. Although overall the study posed some challenges to reproduce (e.g. difficulties with reproduction of automatic metrics and statistics), in the end we did find that the results generally replicate the findings of Santhanam and Shaikh (2019) and seem to follow similar trends.
%U https://aclanthology.org/2022.inlg-genchal.13
%P 86-93
Markdown (Informal)
[A reproduction study of methods for evaluating dialogue system output: Replicating Santhanam and Shaikh (2019)](https://aclanthology.org/2022.inlg-genchal.13) (Braggaar et al., INLG 2022)
ACL
- Anouck Braggaar, Frédéric Tomas, Peter Blomsma, Saar Hommes, Nadine Braun, Emiel van Miltenburg, Chris van der Lee, Martijn Goudbeek, and Emiel Krahmer. 2022. A reproduction study of methods for evaluating dialogue system output: Replicating Santhanam and Shaikh (2019). In Proceedings of the 15th International Conference on Natural Language Generation: Generation Challenges, pages 86–93, Waterville, Maine, USA and virtual meeting. Association for Computational Linguistics.