@inproceedings{coca-etal-2021-gcdf1,
  title     = {{GCDF}1: A Goal- and Context- Driven {F}-Score for Evaluating User Models},
  author    = {Coca, Alexandru and
               Tseng, Bo-Hsiang and
               Byrne, Bill},
  editor    = {Wei, Wei and
               Dai, Bo and
               Zhao, Tuo and
               Li, Lihong and
               Yang, Diyi and
               Chen, Yun-Nung and
               Boureau, Y-Lan and
               Celikyilmaz, Asli and
               Geramifard, Alborz and
               Ahuja, Aman and
               Jiang, Haoming},
  booktitle = {The First Workshop on Evaluations and Assessments of Neural Conversation Systems},
  month     = nov,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.eancs-1.2},
  doi       = {10.18653/v1/2021.eancs-1.2},
  pages     = {7--14},
  abstract  = {The evaluation of dialogue systems in interaction with simulated users has been proposed to improve turn-level, corpus-based metrics which can only evaluate test cases encountered in a corpus and cannot measure system{'}s ability to sustain multi-turn interactions. Recently, little emphasis was put on automatically assessing the quality of the user model itself, so unless correlations with human studies are measured, the reliability of user model based evaluation is unknown. We propose GCDF1, a simple but effective measure of the quality of semantic-level conversations between a goal-driven user agent and a system agent. In contrast with previous approaches we measure the F-score at dialogue level and consider user and system behaviours to improve recall and precision estimation. We facilitate scores interpretation by providing a rich hierarchical structure with information about conversational patterns present in the test data and tools to efficiently query the conversations generated. We apply our framework to assess the performance and weaknesses of a Convlab2 user model.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="coca-etal-2021-gcdf1">
<titleInfo>
<title>GCDF1: A Goal- and Context- Driven F-Score for Evaluating User Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexandru</namePart>
<namePart type="family">Coca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo-Hsiang</namePart>
<namePart type="family">Tseng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bill</namePart>
<namePart type="family">Byrne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The First Workshop on Evaluations and Assessments of Neural Conversation Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Dai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tuo</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lihong</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diyi</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Y-Lan</namePart>
<namePart type="family">Boureau</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asli</namePart>
<namePart type="family">Celikyilmaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alborz</namePart>
<namePart type="family">Geramifard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aman</namePart>
<namePart type="family">Ahuja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoming</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The evaluation of dialogue systems in interaction with simulated users has been proposed to improve turn-level, corpus-based metrics which can only evaluate test cases encountered in a corpus and cannot measure system’s ability to sustain multi-turn interactions. Recently, little emphasis was put on automatically assessing the quality of the user model itself, so unless correlations with human studies are measured, the reliability of user model based evaluation is unknown. We propose GCDF1, a simple but effective measure of the quality of semantic-level conversations between a goal-driven user agent and a system agent. In contrast with previous approaches we measure the F-score at dialogue level and consider user and system behaviours to improve recall and precision estimation. We facilitate scores interpretation by providing a rich hierarchical structure with information about conversational patterns present in the test data and tools to efficiently query the conversations generated. We apply our framework to assess the performance and weaknesses of a Convlab2 user model.</abstract>
<identifier type="citekey">coca-etal-2021-gcdf1</identifier>
<identifier type="doi">10.18653/v1/2021.eancs-1.2</identifier>
<location>
<url>https://aclanthology.org/2021.eancs-1.2</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>7</start>
<end>14</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GCDF1: A Goal- and Context- Driven F-Score for Evaluating User Models
%A Coca, Alexandru
%A Tseng, Bo-Hsiang
%A Byrne, Bill
%Y Wei, Wei
%Y Dai, Bo
%Y Zhao, Tuo
%Y Li, Lihong
%Y Yang, Diyi
%Y Chen, Yun-Nung
%Y Boureau, Y-Lan
%Y Celikyilmaz, Asli
%Y Geramifard, Alborz
%Y Ahuja, Aman
%Y Jiang, Haoming
%S The First Workshop on Evaluations and Assessments of Neural Conversation Systems
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F coca-etal-2021-gcdf1
%X The evaluation of dialogue systems in interaction with simulated users has been proposed to improve turn-level, corpus-based metrics which can only evaluate test cases encountered in a corpus and cannot measure system’s ability to sustain multi-turn interactions. Recently, little emphasis was put on automatically assessing the quality of the user model itself, so unless correlations with human studies are measured, the reliability of user model based evaluation is unknown. We propose GCDF1, a simple but effective measure of the quality of semantic-level conversations between a goal-driven user agent and a system agent. In contrast with previous approaches we measure the F-score at dialogue level and consider user and system behaviours to improve recall and precision estimation. We facilitate scores interpretation by providing a rich hierarchical structure with information about conversational patterns present in the test data and tools to efficiently query the conversations generated. We apply our framework to assess the performance and weaknesses of a Convlab2 user model.
%R 10.18653/v1/2021.eancs-1.2
%U https://aclanthology.org/2021.eancs-1.2
%U https://doi.org/10.18653/v1/2021.eancs-1.2
%P 7-14
Markdown (Informal)
[GCDF1: A Goal- and Context- Driven F-Score for Evaluating User Models](https://aclanthology.org/2021.eancs-1.2) (Coca et al., EANCS 2021)
ACL