@inproceedings{richie-etal-2022-inter,
title = "Inter-annotator agreement is not the ceiling of machine learning performance: Evidence from a comprehensive set of simulations",
author = "Richie, Russell and
Grover, Sachin and
Tsui, Fuchiang (Rich)",
editor = "Demner-Fushman, Dina and
Cohen, Kevin Bretonnel and
Ananiadou, Sophia and
Tsujii, Junichi",
booktitle = "Proceedings of the 21st Workshop on Biomedical Language Processing",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.bionlp-1.26",
doi = "10.18653/v1/2022.bionlp-1.26",
pages = "275--284",
abstract = "It is commonly claimed that inter-annotator agreement (IAA) is the ceiling of machine learning (ML) performance, i.e., that the agreement between an ML system{'}s predictions and an annotator can not be higher than the agreement between two annotators. Although Boguslav {\&} Cohen (2017) showed that this claim is falsified by many real-world ML systems, the claim has persisted. As a complement to this real-world evidence, we conducted a comprehensive set of simulations, and show that an ML model can beat IAA even if (and especially if) annotators are noisy and differ in their underlying classification functions, as long as the ML model is reasonably well-specified. Although the latter condition has long been elusive, leading ML models to underperform IAA, we anticipate that this condition will be increasingly met in the era of big data and deep learning. Our work has implications for (1) maximizing the value of machine learning, (2) adherence to ethical standards in computing, and (3) economical use of annotated resources, which is paramount in settings where annotation is especially expensive, like biomedical natural language processing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="richie-etal-2022-inter">
<titleInfo>
<title>Inter-annotator agreement is not the ceiling of machine learning performance: Evidence from a comprehensive set of simulations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Russell</namePart>
<namePart type="family">Richie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sachin</namePart>
<namePart type="family">Grover</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fuchiang</namePart>
<namePart type="given">(Rich)</namePart>
<namePart type="family">Tsui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st Workshop on Biomedical Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="given">Bretonnel</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>It is commonly claimed that inter-annotator agreement (IAA) is the ceiling of machine learning (ML) performance, i.e., that the agreement between an ML system’s predictions and an annotator can not be higher than the agreement between two annotators. Although Boguslav &amp; Cohen (2017) showed that this claim is falsified by many real-world ML systems, the claim has persisted. As a complement to this real-world evidence, we conducted a comprehensive set of simulations, and show that an ML model can beat IAA even if (and especially if) annotators are noisy and differ in their underlying classification functions, as long as the ML model is reasonably well-specified. Although the latter condition has long been elusive, leading ML models to underperform IAA, we anticipate that this condition will be increasingly met in the era of big data and deep learning. Our work has implications for (1) maximizing the value of machine learning, (2) adherence to ethical standards in computing, and (3) economical use of annotated resources, which is paramount in settings where annotation is especially expensive, like biomedical natural language processing.</abstract>
<identifier type="citekey">richie-etal-2022-inter</identifier>
<identifier type="doi">10.18653/v1/2022.bionlp-1.26</identifier>
<location>
<url>https://aclanthology.org/2022.bionlp-1.26</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>275</start>
<end>284</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Inter-annotator agreement is not the ceiling of machine learning performance: Evidence from a comprehensive set of simulations
%A Richie, Russell
%A Grover, Sachin
%A Tsui, Fuchiang (Rich)
%Y Demner-Fushman, Dina
%Y Cohen, Kevin Bretonnel
%Y Ananiadou, Sophia
%Y Tsujii, Junichi
%S Proceedings of the 21st Workshop on Biomedical Language Processing
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F richie-etal-2022-inter
%X It is commonly claimed that inter-annotator agreement (IAA) is the ceiling of machine learning (ML) performance, i.e., that the agreement between an ML system’s predictions and an annotator can not be higher than the agreement between two annotators. Although Boguslav & Cohen (2017) showed that this claim is falsified by many real-world ML systems, the claim has persisted. As a complement to this real-world evidence, we conducted a comprehensive set of simulations, and show that an ML model can beat IAA even if (and especially if) annotators are noisy and differ in their underlying classification functions, as long as the ML model is reasonably well-specified. Although the latter condition has long been elusive, leading ML models to underperform IAA, we anticipate that this condition will be increasingly met in the era of big data and deep learning. Our work has implications for (1) maximizing the value of machine learning, (2) adherence to ethical standards in computing, and (3) economical use of annotated resources, which is paramount in settings where annotation is especially expensive, like biomedical natural language processing.
%R 10.18653/v1/2022.bionlp-1.26
%U https://aclanthology.org/2022.bionlp-1.26
%U https://doi.org/10.18653/v1/2022.bionlp-1.26
%P 275-284
Markdown (Informal)
[Inter-annotator agreement is not the ceiling of machine learning performance: Evidence from a comprehensive set of simulations](https://aclanthology.org/2022.bionlp-1.26) (Richie et al., BioNLP 2022)
ACL