@article{pavlick-kwiatkowski-2019-inherent,
title = "Inherent Disagreements in Human Textual Inferences",
author = "Pavlick, Ellie and
Kwiatkowski, Tom",
editor = "Lee, Lillian and
Johnson, Mark and
Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "7",
year = "2019",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q19-1043",
doi = "10.1162/tacl_a_00293",
pages = "677--694",
abstract = "We analyze human{'}s disagreements about the validity of natural language inferences. We show that, very often, disagreements are not dismissible as annotation {``}noise{''}, but rather persist as we collect more ratings and as we vary the amount of context provided to raters. We further show that the type of uncertainty captured by current state-of-the-art models for natural language inference is not reflective of the type of uncertainty present in human disagreements. We discuss implications of our results in relation to the recognizing textual entailment (RTE)/natural language inference (NLI) task. We argue for a refined evaluation objective that requires models to explicitly capture the full distribution of plausible human judgments.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pavlick-kwiatkowski-2019-inherent">
    <titleInfo>
        <title>Inherent Disagreements in Human Textual Inferences</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Ellie</namePart>
        <namePart type="family">Pavlick</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Tom</namePart>
        <namePart type="family">Kwiatkowski</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2019</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
        <titleInfo>
            <title>Transactions of the Association for Computational Linguistics</title>
        </titleInfo>
        <originInfo>
            <issuance>continuing</issuance>
            <publisher>MIT Press</publisher>
            <place>
                <placeTerm type="text">Cambridge, MA</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">periodical</genre>
        <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>We analyze humans’ disagreements about the validity of natural language inferences. We show that, very often, disagreements are not dismissible as annotation “noise”, but rather persist as we collect more ratings and as we vary the amount of context provided to raters. We further show that the type of uncertainty captured by current state-of-the-art models for natural language inference is not reflective of the type of uncertainty present in human disagreements. We discuss implications of our results in relation to the recognizing textual entailment (RTE)/natural language inference (NLI) task. We argue for a refined evaluation objective that requires models to explicitly capture the full distribution of plausible human judgments.</abstract>
<identifier type="citekey">pavlick-kwiatkowski-2019-inherent</identifier>
<identifier type="doi">10.1162/tacl_a_00293</identifier>
<location>
<url>https://aclanthology.org/Q19-1043</url>
</location>
<part>
<date>2019</date>
<detail type="volume"><number>7</number></detail>
<extent unit="page">
<start>677</start>
<end>694</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Inherent Disagreements in Human Textual Inferences
%A Pavlick, Ellie
%A Kwiatkowski, Tom
%J Transactions of the Association for Computational Linguistics
%D 2019
%V 7
%I MIT Press
%C Cambridge, MA
%F pavlick-kwiatkowski-2019-inherent
%X We analyze humans’ disagreements about the validity of natural language inferences. We show that, very often, disagreements are not dismissible as annotation “noise”, but rather persist as we collect more ratings and as we vary the amount of context provided to raters. We further show that the type of uncertainty captured by current state-of-the-art models for natural language inference is not reflective of the type of uncertainty present in human disagreements. We discuss implications of our results in relation to the recognizing textual entailment (RTE)/natural language inference (NLI) task. We argue for a refined evaluation objective that requires models to explicitly capture the full distribution of plausible human judgments.
%R 10.1162/tacl_a_00293
%U https://aclanthology.org/Q19-1043
%U https://doi.org/10.1162/tacl_a_00293
%P 677-694
Markdown (Informal)
[Inherent Disagreements in Human Textual Inferences](https://aclanthology.org/Q19-1043) (Pavlick & Kwiatkowski, TACL 2019)
ACL
Ellie Pavlick and Tom Kwiatkowski. 2019. Inherent Disagreements in Human Textual Inferences. Transactions of the Association for Computational Linguistics, 7:677–694.