@inproceedings{lotfi-etal-2022-name,
title = "What Was Your Name Again? Interrogating Generative Conversational Models For Factual Consistency Evaluation",
author = "Lotfi, Ehsan and
De Bruyn, Maxime and
Buhmann, Jeska and
Daelemans, Walter",
editor = "Bosselut, Antoine and
Chandu, Khyathi and
Dhole, Kaustubh and
Gangal, Varun and
Gehrmann, Sebastian and
Jernite, Yacine and
Novikova, Jekaterina and
Perez-Beltrachini, Laura",
booktitle = "Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.gem-1.47",
doi = "10.18653/v1/2022.gem-1.47",
pages = "509--519",
abstract = "Generative conversational agents are known to suffer from problems like inconsistency and hallucination, and a big challenge in studying these issues remains evaluation: they are not properly reflected in common text generation metrics like perplexity or BLEU, and alternative implicit methods like semantic similarity or NLI labels can be misguided when few specific tokens are decisive. In this work we propose ConsisTest; a factual consistency benchmark including both WH and Y/N questions based on PersonaChat, along with a hybrid evaluation pipeline which aims to get the best of symbolic and sub-symbolic methods. Using these and focusing on pretrained generative models like BART, we provide detailed statistics and analysis on how the model{'}s consistency is affected by variations in question and context.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lotfi-etal-2022-name">
<titleInfo>
<title>What Was Your Name Again? Interrogating Generative Conversational Models For Factual Consistency Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ehsan</namePart>
<namePart type="family">Lotfi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxime</namePart>
<namePart type="family">De Bruyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeska</namePart>
<namePart type="family">Buhmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Walter</namePart>
<namePart type="family">Daelemans</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antoine</namePart>
<namePart type="family">Bosselut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyathi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Varun</namePart>
<namePart type="family">Gangal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yacine</namePart>
<namePart type="family">Jernite</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jekaterina</namePart>
<namePart type="family">Novikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Perez-Beltrachini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Generative conversational agents are known to suffer from problems like inconsistency and hallucination, and a big challenge in studying these issues remains evaluation: they are not properly reflected in common text generation metrics like perplexity or BLEU, and alternative implicit methods like semantic similarity or NLI labels can be misguided when few specific tokens are decisive. In this work we propose ConsisTest; a factual consistency benchmark including both WH and Y/N questions based on PersonaChat, along with a hybrid evaluation pipeline which aims to get the best of symbolic and sub-symbolic methods. Using these and focusing on pretrained generative models like BART, we provide detailed statistics and analysis on how the model’s consistency is affected by variations in question and context.</abstract>
<identifier type="citekey">lotfi-etal-2022-name</identifier>
<identifier type="doi">10.18653/v1/2022.gem-1.47</identifier>
<location>
<url>https://aclanthology.org/2022.gem-1.47</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>509</start>
<end>519</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What Was Your Name Again? Interrogating Generative Conversational Models For Factual Consistency Evaluation
%A Lotfi, Ehsan
%A De Bruyn, Maxime
%A Buhmann, Jeska
%A Daelemans, Walter
%Y Bosselut, Antoine
%Y Chandu, Khyathi
%Y Dhole, Kaustubh
%Y Gangal, Varun
%Y Gehrmann, Sebastian
%Y Jernite, Yacine
%Y Novikova, Jekaterina
%Y Perez-Beltrachini, Laura
%S Proceedings of the 2nd Workshop on Natural Language Generation, Evaluation, and Metrics (GEM)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F lotfi-etal-2022-name
%X Generative conversational agents are known to suffer from problems like inconsistency and hallucination, and a big challenge in studying these issues remains evaluation: they are not properly reflected in common text generation metrics like perplexity or BLEU, and alternative implicit methods like semantic similarity or NLI labels can be misguided when few specific tokens are decisive. In this work we propose ConsisTest; a factual consistency benchmark including both WH and Y/N questions based on PersonaChat, along with a hybrid evaluation pipeline which aims to get the best of symbolic and sub-symbolic methods. Using these and focusing on pretrained generative models like BART, we provide detailed statistics and analysis on how the model’s consistency is affected by variations in question and context.
%R 10.18653/v1/2022.gem-1.47
%U https://aclanthology.org/2022.gem-1.47
%U https://doi.org/10.18653/v1/2022.gem-1.47
%P 509-519
Markdown (Informal)
[What Was Your Name Again? Interrogating Generative Conversational Models For Factual Consistency Evaluation](https://aclanthology.org/2022.gem-1.47) (Lotfi et al., GEM 2022)
ACL