@inproceedings{ding-etal-2022-posthoc,
title = "Posthoc Verification and the Fallibility of the Ground Truth",
author = "Ding, Yifan and
Botzer, Nicholas and
Weninger, Tim",
editor = "Bartolo, Max and
Kirk, Hannah and
Rodriguez, Pedro and
Margatina, Katerina and
Thrush, Tristan and
Jia, Robin and
Stenetorp, Pontus and
Williams, Adina and
Kiela, Douwe",
booktitle = "Proceedings of the First Workshop on Dynamic Adversarial Data Collection",
month = jul,
year = "2022",
address = "Seattle, WA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.dadc-1.3/",
doi = "10.18653/v1/2022.dadc-1.3",
pages = "23--29",
abstract = "Classifiers commonly make use of pre-annotated datasets, wherein a model is evaluated by pre-defined metrics on a held-out test set typically made of human-annotated labels. Metrics used in these evaluations are tied to the availability of well-defined ground truth labels, and these metrics typically do not allow for inexact matches. These noisy ground truth labels and strict evaluation metrics may compromise the validity and realism of evaluation results. In the present work, we conduct a systematic label verification experiment on the entity linking (EL) task. Specifically, we ask annotators to verify the correctness of annotations after the fact (, posthoc). Compared to pre-annotation evaluation, state-of-the-art EL models performed extremely well according to the posthoc evaluation methodology. Surprisingly, we find predictions from EL models had a similar or higher verification rate than the ground truth. We conclude with a discussion on these findings and recommendations for future evaluations. The source code, raw results, and evaluation scripts are publicly available via the MIT license at \url{https://github.com/yifding/e2e_EL_evaluate}"
}
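For convenience, a minimal LaTeX sketch of how this entry might be cited, assuming it is saved in a hypothetical file named anthology.bib (the filename and bibliography style are illustrative, not part of the record):

% Minimal usage sketch; anthology.bib and the plain style are assumptions.
\documentclass{article}
\begin{document}
Ground-truth labels can themselves be unreliable \cite{ding-etal-2022-posthoc}.
\bibliographystyle{plain}
\bibliography{anthology}
\end{document}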
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ding-etal-2022-posthoc">
<titleInfo>
<title>Posthoc Verification and the Fallibility of the Ground Truth</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Botzer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Weninger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Dynamic Adversarial Data Collection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Bartolo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hannah</namePart>
<namePart type="family">Kirk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katerina</namePart>
<namePart type="family">Margatina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tristan</namePart>
<namePart type="family">Thrush</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robin</namePart>
<namePart type="family">Jia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pontus</namePart>
<namePart type="family">Stenetorp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adina</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Douwe</namePart>
<namePart type="family">Kiela</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, WA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Classifiers commonly make use of pre-annotated datasets, wherein a model is evaluated by pre-defined metrics on a held-out test set typically made of human-annotated labels. Metrics used in these evaluations are tied to the availability of well-defined ground truth labels, and these metrics typically do not allow for inexact matches. These noisy ground truth labels and strict evaluation metrics may compromise the validity and realism of evaluation results. In the present work, we conduct a systematic label verification experiment on the entity linking (EL) task. Specifically, we ask annotators to verify the correctness of annotations after the fact (, posthoc). Compared to pre-annotation evaluation, state-of-the-art EL models performed extremely well according to the posthoc evaluation methodology. Surprisingly, we find predictions from EL models had a similar or higher verification rate than the ground truth. We conclude with a discussion on these findings and recommendations for future evaluations. The source code, raw results, and evaluation scripts are publicly available via the MIT license at https://github.com/yifding/e2e_EL_evaluate</abstract>
<identifier type="citekey">ding-etal-2022-posthoc</identifier>
<identifier type="doi">10.18653/v1/2022.dadc-1.3</identifier>
<location>
<url>https://aclanthology.org/2022.dadc-1.3/</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>23</start>
<end>29</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Posthoc Verification and the Fallibility of the Ground Truth
%A Ding, Yifan
%A Botzer, Nicholas
%A Weninger, Tim
%Y Bartolo, Max
%Y Kirk, Hannah
%Y Rodriguez, Pedro
%Y Margatina, Katerina
%Y Thrush, Tristan
%Y Jia, Robin
%Y Stenetorp, Pontus
%Y Williams, Adina
%Y Kiela, Douwe
%S Proceedings of the First Workshop on Dynamic Adversarial Data Collection
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, WA
%F ding-etal-2022-posthoc
%X Classifiers commonly make use of pre-annotated datasets, wherein a model is evaluated by pre-defined metrics on a held-out test set typically made of human-annotated labels. Metrics used in these evaluations are tied to the availability of well-defined ground truth labels, and these metrics typically do not allow for inexact matches. These noisy ground truth labels and strict evaluation metrics may compromise the validity and realism of evaluation results. In the present work, we conduct a systematic label verification experiment on the entity linking (EL) task. Specifically, we ask annotators to verify the correctness of annotations after the fact (i.e., posthoc). Compared to pre-annotation evaluation, state-of-the-art EL models performed extremely well according to the posthoc evaluation methodology. Surprisingly, we find predictions from EL models had a similar or higher verification rate than the ground truth. We conclude with a discussion on these findings and recommendations for future evaluations. The source code, raw results, and evaluation scripts are publicly available via the MIT license at https://github.com/yifding/e2e_EL_evaluate
%R 10.18653/v1/2022.dadc-1.3
%U https://aclanthology.org/2022.dadc-1.3/
%U https://doi.org/10.18653/v1/2022.dadc-1.3
%P 23-29
Markdown (Informal)
[Posthoc Verification and the Fallibility of the Ground Truth](https://aclanthology.org/2022.dadc-1.3/) (Ding et al., DADC 2022)
ACL
Yifan Ding, Nicholas Botzer, and Tim Weninger. 2022. Posthoc Verification and the Fallibility of the Ground Truth. In Proceedings of the First Workshop on Dynamic Adversarial Data Collection, pages 23–29, Seattle, WA. Association for Computational Linguistics.