BibTeX
@article{paun-etal-2018-comparing,
title = "Comparing {B}ayesian Models of Annotation",
author = "Paun, Silviu and
Carpenter, Bob and
Chamberlain, Jon and
Hovy, Dirk and
Kruschwitz, Udo and
Poesio, Massimo",
editor = "Lee, Lillian and
Johnson, Mark and
Toutanova, Kristina and
Roark, Brian",
journal = "Transactions of the Association for Computational Linguistics",
volume = "6",
year = "2018",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q18-1040/",
doi = "10.1162/tacl_a_00040",
pages = "571--585",
abstract = "The analysis of crowdsourced annotations in natural language processing is concerned with identifying (1) gold standard labels, (2) annotator accuracies and biases, and (3) item difficulties and error patterns. Traditionally, majority voting was used for 1, and coefficients of agreement for 2 and 3. Lately, model-based analysis of corpus annotations have proven better at all three tasks. But there has been relatively little work comparing them on the same datasets. This paper aims to fill this gap by analyzing six models of annotation, covering different approaches to annotator ability, item difficulty, and parameter pooling (tying) across annotators and items. We evaluate these models along four aspects: comparison to gold labels, predictive accuracy for new annotations, annotator characterization, and item difficulty, using four datasets with varying degrees of noise in the form of random (spammy) annotators. We conclude with guidelines for model selection, application, and implementation."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="paun-etal-2018-comparing">
<titleInfo>
<title>Comparing Bayesian Models of Annotation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Silviu</namePart>
<namePart type="family">Paun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bob</namePart>
<namePart type="family">Carpenter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jon</namePart>
<namePart type="family">Chamberlain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dirk</namePart>
<namePart type="family">Hovy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Udo</namePart>
<namePart type="family">Kruschwitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Massimo</namePart>
<namePart type="family">Poesio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>The analysis of crowdsourced annotations in natural language processing is concerned with identifying (1) gold standard labels, (2) annotator accuracies and biases, and (3) item difficulties and error patterns. Traditionally, majority voting was used for 1, and coefficients of agreement for 2 and 3. Lately, model-based analysis of corpus annotations has proven better at all three tasks. But there has been relatively little work comparing them on the same datasets. This paper aims to fill this gap by analyzing six models of annotation, covering different approaches to annotator ability, item difficulty, and parameter pooling (tying) across annotators and items. We evaluate these models along four aspects: comparison to gold labels, predictive accuracy for new annotations, annotator characterization, and item difficulty, using four datasets with varying degrees of noise in the form of random (spammy) annotators. We conclude with guidelines for model selection, application, and implementation.</abstract>
<identifier type="citekey">paun-etal-2018-comparing</identifier>
<identifier type="doi">10.1162/tacl_a_00040</identifier>
<location>
<url>https://aclanthology.org/Q18-1040/</url>
</location>
<part>
<date>2018</date>
<detail type="volume"><number>6</number></detail>
<extent unit="page">
<start>571</start>
<end>585</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Journal Article
%T Comparing Bayesian Models of Annotation
%A Paun, Silviu
%A Carpenter, Bob
%A Chamberlain, Jon
%A Hovy, Dirk
%A Kruschwitz, Udo
%A Poesio, Massimo
%J Transactions of the Association for Computational Linguistics
%D 2018
%V 6
%I MIT Press
%C Cambridge, MA
%F paun-etal-2018-comparing
%X The analysis of crowdsourced annotations in natural language processing is concerned with identifying (1) gold standard labels, (2) annotator accuracies and biases, and (3) item difficulties and error patterns. Traditionally, majority voting was used for 1, and coefficients of agreement for 2 and 3. Lately, model-based analysis of corpus annotations has proven better at all three tasks. But there has been relatively little work comparing them on the same datasets. This paper aims to fill this gap by analyzing six models of annotation, covering different approaches to annotator ability, item difficulty, and parameter pooling (tying) across annotators and items. We evaluate these models along four aspects: comparison to gold labels, predictive accuracy for new annotations, annotator characterization, and item difficulty, using four datasets with varying degrees of noise in the form of random (spammy) annotators. We conclude with guidelines for model selection, application, and implementation.
%R 10.1162/tacl_a_00040
%U https://aclanthology.org/Q18-1040/
%U https://doi.org/10.1162/tacl_a_00040
%P 571-585

Markdown (Informal)
[Comparing Bayesian Models of Annotation](https://aclanthology.org/Q18-1040/) (Paun et al., TACL 2018)

ACL
Silviu Paun, Bob Carpenter, Jon Chamberlain, Dirk Hovy, Udo Kruschwitz, and Massimo Poesio. 2018. Comparing Bayesian Models of Annotation. Transactions of the Association for Computational Linguistics, 6:571–585.
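The abstract contrasts majority voting with model-based aggregation that accounts for annotator accuracy. As a minimal illustrative sketch only (the toy data, function names, and the fixed-point weighting scheme below are assumptions made for illustration, not the paper's models or code), the two ideas can be contrasted in Python:

```python
# Illustrative sketch only -- NOT the paper's code. Toy data, names, and the
# fixed-point weighting scheme are assumptions made for illustration.
from collections import Counter, defaultdict

# annotations[item] = list of (annotator, label) pairs (hypothetical toy data)
annotations = {
    "item1": [("a1", "POS"), ("a2", "POS"), ("a3", "NEG")],
    "item2": [("a1", "NEG"), ("a2", "NEG"), ("a3", "NEG")],
    "item3": [("a1", "POS"), ("a2", "NEG"), ("a3", "NEG")],
}

def majority_vote(annotations):
    """Most frequent label per item (ties broken arbitrarily)."""
    return {
        item: Counter(label for _, label in pairs).most_common(1)[0][0]
        for item, pairs in annotations.items()
    }

def accuracy_weighted_vote(annotations, n_rounds=5):
    """Crude fixed point: score annotators by agreement with the current
    consensus, then re-vote with votes weighted by those scores. A rough
    stand-in for model-based aggregation, not a Bayesian model."""
    consensus = majority_vote(annotations)
    accuracy = {}
    for _ in range(n_rounds):
        correct, total = defaultdict(float), defaultdict(float)
        for item, pairs in annotations.items():
            for annotator, label in pairs:
                total[annotator] += 1.0
                correct[annotator] += float(label == consensus[item])
        # add-one smoothing so no annotator gets weight exactly 0 or 1
        accuracy = {a: (correct[a] + 1.0) / (total[a] + 2.0) for a in total}
        consensus = {}
        for item, pairs in annotations.items():
            scores = defaultdict(float)
            for annotator, label in pairs:
                scores[label] += accuracy[annotator]
            consensus[item] = max(scores, key=scores.get)
    return consensus, accuracy

if __name__ == "__main__":
    print("majority vote:", majority_vote(annotations))
    labels, acc = accuracy_weighted_vote(annotations)
    print("weighted vote:", labels)
    print("annotator scores:", acc)
```

On the toy data, the weighted vote downweights annotators who often disagree with the consensus; the Bayesian models compared in the paper estimate annotator ability and item difficulty with full probabilistic inference rather than a fixed point like this.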