BibTeX
@inproceedings{eisenstadt-elhadad-2021-evaluation,
    title = "Evaluation Guidelines to Deal with Implicit Phenomena to Assess Factuality in Data-to-Text Generation",
    author = "Eisenstadt, Roy and
      Elhadad, Michael",
    editor = "Roth, Michael and
      Tsarfaty, Reut and
      Goldberg, Yoav",
    booktitle = "Proceedings of the 1st Workshop on Understanding Implicit and Underspecified Language",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.unimplicit-1.3",
    doi = "10.18653/v1/2021.unimplicit-1.3",
    pages = "20--27",
    abstract = "Data-to-text generation systems are trained on large datasets, such as WebNLG, RotoWire, E2E or DART. Beyond traditional token-overlap evaluation metrics (BLEU or METEOR), a key concern faced by recent generators is to control the factuality of the generated text with respect to the input data specification. We report on our experience when developing an automatic factuality evaluation system for data-to-text generation that we are testing on WebNLG and E2E data. We aim to prepare gold data annotated manually to identify cases where the text communicates more information than is warranted based on the input data (extra) or fails to communicate data that is part of the input (missing). While analyzing reference (data, text) samples, we encountered a range of systematic uncertainties that are related to cases of implicit phenomena in text, and the nature of non-linguistic knowledge we expect to be involved when assessing factuality. We derive from our experience a set of evaluation guidelines to reach high inter-annotator agreement on such cases.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="eisenstadt-elhadad-2021-evaluation">
    <titleInfo>
        <title>Evaluation Guidelines to Deal with Implicit Phenomena to Assess Factuality in Data-to-Text Generation</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Roy</namePart>
        <namePart type="family">Eisenstadt</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Michael</namePart>
        <namePart type="family">Elhadad</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 1st Workshop on Understanding Implicit and Underspecified Language</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Michael</namePart>
            <namePart type="family">Roth</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Reut</namePart>
            <namePart type="family">Tsarfaty</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Yoav</namePart>
            <namePart type="family">Goldberg</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Online</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Data-to-text generation systems are trained on large datasets, such as WebNLG, RotoWire, E2E or DART. Beyond traditional token-overlap evaluation metrics (BLEU or METEOR), a key concern faced by recent generators is to control the factuality of the generated text with respect to the input data specification. We report on our experience when developing an automatic factuality evaluation system for data-to-text generation that we are testing on WebNLG and E2E data. We aim to prepare gold data annotated manually to identify cases where the text communicates more information than is warranted based on the input data (extra) or fails to communicate data that is part of the input (missing). While analyzing reference (data, text) samples, we encountered a range of systematic uncertainties that are related to cases of implicit phenomena in text, and the nature of non-linguistic knowledge we expect to be involved when assessing factuality. We derive from our experience a set of evaluation guidelines to reach high inter-annotator agreement on such cases.</abstract>
<identifier type="citekey">eisenstadt-elhadad-2021-evaluation</identifier>
<identifier type="doi">10.18653/v1/2021.unimplicit-1.3</identifier>
<location>
<url>https://aclanthology.org/2021.unimplicit-1.3</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>20</start>
<end>27</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Evaluation Guidelines to Deal with Implicit Phenomena to Assess Factuality in Data-to-Text Generation
%A Eisenstadt, Roy
%A Elhadad, Michael
%Y Roth, Michael
%Y Tsarfaty, Reut
%Y Goldberg, Yoav
%S Proceedings of the 1st Workshop on Understanding Implicit and Underspecified Language
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F eisenstadt-elhadad-2021-evaluation
%X Data-to-text generation systems are trained on large datasets, such as WebNLG, RotoWire, E2E or DART. Beyond traditional token-overlap evaluation metrics (BLEU or METEOR), a key concern faced by recent generators is to control the factuality of the generated text with respect to the input data specification. We report on our experience when developing an automatic factuality evaluation system for data-to-text generation that we are testing on WebNLG and E2E data. We aim to prepare gold data annotated manually to identify cases where the text communicates more information than is warranted based on the input data (extra) or fails to communicate data that is part of the input (missing). While analyzing reference (data, text) samples, we encountered a range of systematic uncertainties that are related to cases of implicit phenomena in text, and the nature of non-linguistic knowledge we expect to be involved when assessing factuality. We derive from our experience a set of evaluation guidelines to reach high inter-annotator agreement on such cases.
%R 10.18653/v1/2021.unimplicit-1.3
%U https://aclanthology.org/2021.unimplicit-1.3
%U https://doi.org/10.18653/v1/2021.unimplicit-1.3
%P 20-27
Markdown (Informal)
[Evaluation Guidelines to Deal with Implicit Phenomena to Assess Factuality in Data-to-Text Generation](https://aclanthology.org/2021.unimplicit-1.3) (Eisenstadt & Elhadad, unimplicit 2021)
ACL
Roy Eisenstadt and Michael Elhadad. 2021. Evaluation Guidelines to Deal with Implicit Phenomena to Assess Factuality in Data-to-Text Generation. In Proceedings of the 1st Workshop on Understanding Implicit and Underspecified Language, pages 20–27, Online. Association for Computational Linguistics.