@inproceedings{wu-chien-2020-learning,
title = "Learning the Human Judgment for the Automatic Evaluation of Chatbot",
author = "Wu, Shih-Hung and
Chien, Sheng-Lun",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.198",
pages = "1598--1602",
abstract = "It is hard to evaluate the quality of the generated text by a generative dialogue system. Currently, dialogue evaluation relies on human judges to label the quality of the generated text. It is not a reusable mechanism that can give consistent evaluation for system developers. We believe that it is easier to get consistent results on comparing two generated dialogue by two systems and it is hard to give a consistent quality score on only one system at a time. In this paper, we propose a machine learning approach to reduce the effort of human evaluation by learning the human judgment on comparing two dialogue systems. Training from the human labeling result, the evaluation model learns which generative models is better in each dialog context. Thus, it can be used for system developers to compare the fine-tuned models over and over again without the human labor. In our experiment we find the agreement between the learned model and human judge is 70{\%}. The experiment is conducted on comparing two attention based GRU-RNN generative models.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-chien-2020-learning">
<titleInfo>
<title>Learning the Human Judgment for the Automatic Evaluation of Chatbot</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shih-Hung</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng-Lun</namePart>
<namePart type="family">Chien</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>It is hard to evaluate the quality of the text generated by a generative dialogue system. Currently, dialogue evaluation relies on human judges to label the quality of the generated text, which is not a reusable mechanism that can give consistent evaluations to system developers. We believe that it is easier to get consistent results when comparing two dialogues generated by two systems than to give a consistent quality score to a single system at a time. In this paper, we propose a machine learning approach that reduces the effort of human evaluation by learning the human judgment on comparing two dialogue systems. Trained on the human labeling results, the evaluation model learns which generative model is better in each dialogue context. Thus, system developers can use it to compare fine-tuned models over and over again without human labor. In our experiment, we find that the agreement between the learned model and the human judges is 70%. The experiment is conducted by comparing two attention-based GRU-RNN generative models.</abstract>
<identifier type="citekey">wu-chien-2020-learning</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.198</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>1598</start>
<end>1602</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Learning the Human Judgment for the Automatic Evaluation of Chatbot
%A Wu, Shih-Hung
%A Chien, Sheng-Lun
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F wu-chien-2020-learning
%X It is hard to evaluate the quality of the text generated by a generative dialogue system. Currently, dialogue evaluation relies on human judges to label the quality of the generated text, which is not a reusable mechanism that can give consistent evaluations to system developers. We believe that it is easier to get consistent results when comparing two dialogues generated by two systems than to give a consistent quality score to a single system at a time. In this paper, we propose a machine learning approach that reduces the effort of human evaluation by learning the human judgment on comparing two dialogue systems. Trained on the human labeling results, the evaluation model learns which generative model is better in each dialogue context. Thus, system developers can use it to compare fine-tuned models over and over again without human labor. In our experiment, we find that the agreement between the learned model and the human judges is 70%. The experiment is conducted by comparing two attention-based GRU-RNN generative models.
%U https://aclanthology.org/2020.lrec-1.198
%P 1598-1602
Markdown (Informal)
[Learning the Human Judgment for the Automatic Evaluation of Chatbot](https://aclanthology.org/2020.lrec-1.198) (Wu & Chien, LREC 2020)
ACL
Shih-Hung Wu and Sheng-Lun Chien. 2020. [Learning the Human Judgment for the Automatic Evaluation of Chatbot](https://aclanthology.org/2020.lrec-1.198). In *Proceedings of the Twelfth Language Resources and Evaluation Conference*, pages 1598–1602, Marseille, France. European Language Resources Association.
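
As a quick illustration of the pairwise-evaluation idea the abstract describes (learning a reusable judge from human A/B comparisons so developers can re-run evaluations without further labeling), here is a minimal sketch. The toy data, the TF-IDF features, and the logistic-regression classifier are illustrative assumptions standing in for the paper's attention-based GRU-RNN setup and its human-labeled corpus.

```python
# Hypothetical sketch of a learned pairwise judge: given a dialogue context
# and two candidate responses (one per system), predict which response the
# human annotators would prefer. Not the paper's actual model or data.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Toy labeled comparisons: (context, system A response, system B response,
# human label: 0 = A preferred, 1 = B preferred).
data = [
    ("how are you", "i am fine thanks", "fine", 0),
    ("what is your name", "name", "my name is bot", 1),
    ("do you like music", "yes i like music a lot", "music", 0),
    ("where do you live", "live", "i live on a server", 1),
]

# Encode each comparison as one string; the vectorizer builds a shared
# vocabulary over the context and both candidate responses.
texts = [f"{c} [A] {a} [B] {b}" for c, a, b, _ in data]
labels = [y for _, _, _, y in data]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Train the judge on the human preference labels.
clf = LogisticRegression().fit(X, labels)

# Reuse the learned judge on a new pair without any additional human labor.
query = "how are you [A] i am doing well [B] well"
print(clf.predict(vectorizer.transform([query])))  # 0 -> prefers system A
```

In this framing, agreement with human judges (70% in the paper's experiment) is simply the classifier's accuracy on held-out human-labeled comparisons.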