BibTeX
@inproceedings{ghazarian-etal-2019-better,
    title = "Better Automatic Evaluation of Open-Domain Dialogue Systems with Contextualized Embeddings",
    author = "Ghazarian, Sarik and
      Wei, Johnny and
      Galstyan, Aram and
      Peng, Nanyun",
    editor = "Bosselut, Antoine and
      Celikyilmaz, Asli and
      Ghazvininejad, Marjan and
      Iyer, Srinivasan and
      Khandelwal, Urvashi and
      Rashkin, Hannah and
      Wolf, Thomas",
    booktitle = "Proceedings of the Workshop on Methods for Optimizing and Evaluating Neural Language Generation",
    month = jun,
    year = "2019",
    address = "Minneapolis, Minnesota",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W19-2310",
    doi = "10.18653/v1/W19-2310",
    pages = "82--89",
abstract = "Despite advances in open-domain dialogue systems, automatic evaluation of such systems is still a challenging problem. Traditional reference-based metrics such as BLEU are ineffective because there could be many valid responses for a given context that share no common words with reference responses. A recent work proposed Referenced metric and Unreferenced metric Blended Evaluation Routine (RUBER) to combine a learning-based metric, which predicts relatedness between a generated response and a given query, with reference-based metric; it showed high correlation with human judgments. In this paper, we explore using contextualized word embeddings to compute more accurate relatedness scores, thus better evaluation metrics. Experiments show that our evaluation metrics outperform RUBER, which is trained on static embeddings.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ghazarian-etal-2019-better">
    <titleInfo>
      <title>Better Automatic Evaluation of Open-Domain Dialogue Systems with Contextualized Embeddings</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Sarik</namePart>
      <namePart type="family">Ghazarian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Johnny</namePart>
      <namePart type="family">Wei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aram</namePart>
      <namePart type="family">Galstyan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nanyun</namePart>
      <namePart type="family">Peng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2019-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Workshop on Methods for Optimizing and Evaluating Neural Language Generation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Antoine</namePart>
        <namePart type="family">Bosselut</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Asli</namePart>
        <namePart type="family">Celikyilmaz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marjan</namePart>
        <namePart type="family">Ghazvininejad</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Srinivasan</namePart>
        <namePart type="family">Iyer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Urvashi</namePart>
        <namePart type="family">Khandelwal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hannah</namePart>
        <namePart type="family">Rashkin</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Thomas</namePart>
        <namePart type="family">Wolf</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Minneapolis, Minnesota</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Despite advances in open-domain dialogue systems, automatic evaluation of such systems is still a challenging problem. Traditional reference-based metrics such as BLEU are ineffective because there could be many valid responses for a given context that share no common words with reference responses. A recent work proposed the Referenced metric and Unreferenced metric Blended Evaluation Routine (RUBER) to combine a learning-based metric, which predicts relatedness between a generated response and a given query, with a reference-based metric; it showed high correlation with human judgments. In this paper, we explore using contextualized word embeddings to compute more accurate relatedness scores and thus obtain better evaluation metrics. Experiments show that our evaluation metrics outperform RUBER, which is trained on static embeddings.</abstract>
<identifier type="citekey">ghazarian-etal-2019-better</identifier>
<identifier type="doi">10.18653/v1/W19-2310</identifier>
<location>
<url>https://aclanthology.org/W19-2310</url>
</location>
<part>
<date>2019-06</date>
<extent unit="page">
<start>82</start>
<end>89</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Better Automatic Evaluation of Open-Domain Dialogue Systems with Contextualized Embeddings
%A Ghazarian, Sarik
%A Wei, Johnny
%A Galstyan, Aram
%A Peng, Nanyun
%Y Bosselut, Antoine
%Y Celikyilmaz, Asli
%Y Ghazvininejad, Marjan
%Y Iyer, Srinivasan
%Y Khandelwal, Urvashi
%Y Rashkin, Hannah
%Y Wolf, Thomas
%S Proceedings of the Workshop on Methods for Optimizing and Evaluating Neural Language Generation
%D 2019
%8 June
%I Association for Computational Linguistics
%C Minneapolis, Minnesota
%F ghazarian-etal-2019-better
%X Despite advances in open-domain dialogue systems, automatic evaluation of such systems is still a challenging problem. Traditional reference-based metrics such as BLEU are ineffective because there could be many valid responses for a given context that share no common words with reference responses. A recent work proposed the Referenced metric and Unreferenced metric Blended Evaluation Routine (RUBER) to combine a learning-based metric, which predicts relatedness between a generated response and a given query, with a reference-based metric; it showed high correlation with human judgments. In this paper, we explore using contextualized word embeddings to compute more accurate relatedness scores and thus obtain better evaluation metrics. Experiments show that our evaluation metrics outperform RUBER, which is trained on static embeddings.
%R 10.18653/v1/W19-2310
%U https://aclanthology.org/W19-2310
%U https://doi.org/10.18653/v1/W19-2310
%P 82-89
Markdown (Informal)
[Better Automatic Evaluation of Open-Domain Dialogue Systems with Contextualized Embeddings](https://aclanthology.org/W19-2310) (Ghazarian et al., NAACL 2019)
ACL
Sarik Ghazarian, Johnny Wei, Aram Galstyan, and Nanyun Peng. 2019. Better Automatic Evaluation of Open-Domain Dialogue Systems with Contextualized Embeddings. In Proceedings of the Workshop on Methods for Optimizing and Evaluating Neural Language Generation, pages 82–89, Minneapolis, Minnesota. Association for Computational Linguistics.
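
The abstract describes an evaluation recipe that blends a referenced metric with an unreferenced relatedness metric, computing relatedness from contextualized rather than static word embeddings. A minimal sketch of the referenced, embedding-similarity side of such a metric is shown below; the `bert-base-uncased` encoder, the mean pooling, the helper names, and the example sentences are assumptions for illustration, not the authors' exact configuration.

```python
# Sketch: cosine similarity between pooled contextualized embeddings of a
# generated response and a human reference (referenced-metric style).
# Model name, mean pooling, and the example strings are assumptions.
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
model.eval()

def pooled_embedding(text: str) -> torch.Tensor:
    """Mean-pool the last-layer token embeddings of `text`."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state  # (1, seq_len, hidden_dim)
    return hidden.mean(dim=1).squeeze(0)            # (hidden_dim,)

def referenced_score(generated: str, reference: str) -> float:
    """Similarity of the generated response to the human reference."""
    g = pooled_embedding(generated)
    r = pooled_embedding(reference)
    return torch.nn.functional.cosine_similarity(g, r, dim=0).item()

if __name__ == "__main__":
    print(referenced_score("Sure, I love hiking on weekends.",
                           "Yes, I really enjoy outdoor activities."))
```

In a RUBER-style setup, this referenced score would be blended (e.g., averaged or min-pooled) with an unreferenced score from a model trained to predict query-response relatedness; that trained component and the blending step are omitted from this sketch.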