@inproceedings{sommerauer-etal-2020-describe,
title = "Would you describe a leopard as yellow? Evaluating crowd-annotations with justified and informative disagreement",
author = "Sommerauer, Pia and
Fokkens, Antske and
Vossen, Piek",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.422",
doi = "10.18653/v1/2020.coling-main.422",
pages = "4798--4809",
abstract = "Semantic annotation tasks contain ambiguity and vagueness and require varying degrees of world knowledge. Disagreement is an important indication of these phenomena. Most traditional evaluation methods, however, critically hinge upon the notion of inter-annotator agreement. While alternative frameworks have been proposed, they do not move beyond agreement as the most important indicator of quality. Critically, evaluations usually do not distinguish between instances in which agreement is expected and instances in which disagreement is not only valid but desired because it captures the linguistic and cognitive phenomena in the data. We attempt to overcome these limitations using the example of a dataset that provides semantic representations for diagnostic experiments on language models. Ambiguity, vagueness, and difficulty are not only highly relevant for this use-case, but also play an important role in other types of semantic annotation tasks. We establish an additional, agreement-independent quality metric based on answer-coherence and evaluate it in comparison to existing metrics. We compare against a gold standard and evaluate on expected disagreement. Despite generally low agreement, annotations follow expected behavior and have high accuracy when selected based on coherence. We show that combining different quality metrics enables a more comprehensive evaluation than relying exclusively on agreement.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sommerauer-etal-2020-describe">
<titleInfo>
<title>Would you describe a leopard as yellow? Evaluating crowd-annotations with justified and informative disagreement</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pia</namePart>
<namePart type="family">Sommerauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antske</namePart>
<namePart type="family">Fokkens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Piek</namePart>
<namePart type="family">Vossen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Semantic annotation tasks contain ambiguity and vagueness and require varying degrees of world knowledge. Disagreement is an important indication of these phenomena. Most traditional evaluation methods, however, critically hinge upon the notion of inter-annotator agreement. While alternative frameworks have been proposed, they do not move beyond agreement as the most important indicator of quality. Critically, evaluations usually do not distinguish between instances in which agreement is expected and instances in which disagreement is not only valid but desired because it captures the linguistic and cognitive phenomena in the data. We attempt to overcome these limitations using the example of a dataset that provides semantic representations for diagnostic experiments on language models. Ambiguity, vagueness, and difficulty are not only highly relevant for this use-case, but also play an important role in other types of semantic annotation tasks. We establish an additional, agreement-independent quality metric based on answer-coherence and evaluate it in comparison to existing metrics. We compare against a gold standard and evaluate on expected disagreement. Despite generally low agreement, annotations follow expected behavior and have high accuracy when selected based on coherence. We show that combining different quality metrics enables a more comprehensive evaluation than relying exclusively on agreement.</abstract>
<identifier type="citekey">sommerauer-etal-2020-describe</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.422</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.422</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>4798</start>
<end>4809</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Would you describe a leopard as yellow? Evaluating crowd-annotations with justified and informative disagreement
%A Sommerauer, Pia
%A Fokkens, Antske
%A Vossen, Piek
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F sommerauer-etal-2020-describe
%X Semantic annotation tasks contain ambiguity and vagueness and require varying degrees of world knowledge. Disagreement is an important indication of these phenomena. Most traditional evaluation methods, however, critically hinge upon the notion of inter-annotator agreement. While alternative frameworks have been proposed, they do not move beyond agreement as the most important indicator of quality. Critically, evaluations usually do not distinguish between instances in which agreement is expected and instances in which disagreement is not only valid but desired because it captures the linguistic and cognitive phenomena in the data. We attempt to overcome these limitations using the example of a dataset that provides semantic representations for diagnostic experiments on language models. Ambiguity, vagueness, and difficulty are not only highly relevant for this use-case, but also play an important role in other types of semantic annotation tasks. We establish an additional, agreement-independent quality metric based on answer-coherence and evaluate it in comparison to existing metrics. We compare against a gold standard and evaluate on expected disagreement. Despite generally low agreement, annotations follow expected behavior and have high accuracy when selected based on coherence. We show that combining different quality metrics enables a more comprehensive evaluation than relying exclusively on agreement.
%R 10.18653/v1/2020.coling-main.422
%U https://aclanthology.org/2020.coling-main.422
%U https://doi.org/10.18653/v1/2020.coling-main.422
%P 4798-4809
Markdown (Informal)
[Would you describe a leopard as yellow? Evaluating crowd-annotations with justified and informative disagreement](https://aclanthology.org/2020.coling-main.422) (Sommerauer et al., COLING 2020)
ACL