@inproceedings{niven-kao-2019-probing,
title = "Probing Neural Network Comprehension of Natural Language Arguments",
author = "Niven, Timothy and
Kao, Hung-Yu",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'\i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1459",
doi = "10.18653/v1/P19-1459",
pages = "4658--4664",
abstract = "We are surprised to find that BERT{'}s peak performance of 77{\%} on the Argument Reasoning Comprehension Task reaches just three points below the average untrained human baseline. However, we show that this result is entirely accounted for by exploitation of spurious statistical cues in the dataset. We analyze the nature of these cues and demonstrate that a range of models all exploit them. This analysis informs the construction of an adversarial dataset on which all models achieve random accuracy. Our adversarial dataset provides a more robust assessment of argument comprehension and should be adopted as the standard in future work.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="niven-kao-2019-probing">
<titleInfo>
<title>Probing Neural Network Comprehension of Natural Language Arguments</title>
</titleInfo>
<name type="personal">
<namePart type="given">Timothy</namePart>
<namePart type="family">Niven</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hung-Yu</namePart>
<namePart type="family">Kao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Korhonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Traum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Màrquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We are surprised to find that BERT’s peak performance of 77% on the Argument Reasoning Comprehension Task reaches just three points below the average untrained human baseline. However, we show that this result is entirely accounted for by exploitation of spurious statistical cues in the dataset. We analyze the nature of these cues and demonstrate that a range of models all exploit them. This analysis informs the construction of an adversarial dataset on which all models achieve random accuracy. Our adversarial dataset provides a more robust assessment of argument comprehension and should be adopted as the standard in future work.</abstract>
<identifier type="citekey">niven-kao-2019-probing</identifier>
<identifier type="doi">10.18653/v1/P19-1459</identifier>
<location>
<url>https://aclanthology.org/P19-1459</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>4658</start>
<end>4664</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Probing Neural Network Comprehension of Natural Language Arguments
%A Niven, Timothy
%A Kao, Hung-Yu
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F niven-kao-2019-probing
%X We are surprised to find that BERT’s peak performance of 77% on the Argument Reasoning Comprehension Task reaches just three points below the average untrained human baseline. However, we show that this result is entirely accounted for by exploitation of spurious statistical cues in the dataset. We analyze the nature of these cues and demonstrate that a range of models all exploit them. This analysis informs the construction of an adversarial dataset on which all models achieve random accuracy. Our adversarial dataset provides a more robust assessment of argument comprehension and should be adopted as the standard in future work.
%R 10.18653/v1/P19-1459
%U https://aclanthology.org/P19-1459
%U https://doi.org/10.18653/v1/P19-1459
%P 4658-4664
Markdown (Informal)
[Probing Neural Network Comprehension of Natural Language Arguments](https://aclanthology.org/P19-1459) (Niven & Kao, ACL 2019)
ACL