@article{gupta-etal-2022-model,
title = "Is My Model Using the Right Evidence? Systematic Probes for Examining Evidence-Based Tabular Reasoning",
author = "Gupta, Vivek and
Bhat, Riyaz A. and
Ghosal, Atreya and
Shrivastava, Manish and
Singh, Maneesh and
Srikumar, Vivek",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "10",
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2022.tacl-1.38",
doi = "10.1162/tacl_a_00482",
pages = "659--679",
abstract = "Neural models command state-of-the-art performance across NLP tasks, including ones involving {``}reasoning{''}. Models claiming to reason about the evidence presented to them should attend to the correct parts of the input while avoiding spurious patterns therein, be self-consistent in their predictions across inputs, and be immune to biases derived from their pre-training in a nuanced, context- sensitive fashion. Do the prevalent *BERT- family of models do so? In this paper, we study this question using the problem of reasoning on tabular data. Tabular inputs are especially well-suited for the study{---}they admit systematic probes targeting the properties listed above. Our experiments demonstrate that a RoBERTa-based model, representative of the current state-of-the-art, fails at reasoning on the following counts: it (a) ignores relevant parts of the evidence, (b) is over- sensitive to annotation artifacts, and (c) relies on the knowledge encoded in the pre-trained language model rather than the evidence presented in its tabular inputs. Finally, through inoculation experiments, we show that fine- tuning the model on perturbed data does not help it overcome the above challenges.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gupta-etal-2022-model">
<titleInfo>
<title>Is My Model Using the Right Evidence? Systematic Probes for Examining Evidence-Based Tabular Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Riyaz</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Bhat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atreya</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manish</namePart>
<namePart type="family">Shrivastava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maneesh</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Neural models command state-of-the-art performance across NLP tasks, including ones involving “reasoning”. Models claiming to reason about the evidence presented to them should attend to the correct parts of the input while avoiding spurious patterns therein, be self-consistent in their predictions across inputs, and be immune to biases derived from their pre-training in a nuanced, context-sensitive fashion. Do the prevalent *BERT-family of models do so? In this paper, we study this question using the problem of reasoning on tabular data. Tabular inputs are especially well-suited for the study—they admit systematic probes targeting the properties listed above. Our experiments demonstrate that a RoBERTa-based model, representative of the current state-of-the-art, fails at reasoning on the following counts: it (a) ignores relevant parts of the evidence, (b) is over-sensitive to annotation artifacts, and (c) relies on the knowledge encoded in the pre-trained language model rather than the evidence presented in its tabular inputs. Finally, through inoculation experiments, we show that fine-tuning the model on perturbed data does not help it overcome the above challenges.</abstract>
<identifier type="citekey">gupta-etal-2022-model</identifier>
<identifier type="doi">10.1162/tacl_a_00482</identifier>
<location>
<url>https://aclanthology.org/2022.tacl-1.38</url>
</location>
<part>
<date>2022</date>
<detail type="volume"><number>10</number></detail>
<extent unit="page">
<start>659</start>
<end>679</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Is My Model Using the Right Evidence? Systematic Probes for Examining Evidence-Based Tabular Reasoning
%A Gupta, Vivek
%A Bhat, Riyaz A.
%A Ghosal, Atreya
%A Shrivastava, Manish
%A Singh, Maneesh
%A Srikumar, Vivek
%J Transactions of the Association for Computational Linguistics
%D 2022
%V 10
%I MIT Press
%C Cambridge, MA
%F gupta-etal-2022-model
%X Neural models command state-of-the-art performance across NLP tasks, including ones involving “reasoning”. Models claiming to reason about the evidence presented to them should attend to the correct parts of the input while avoiding spurious patterns therein, be self-consistent in their predictions across inputs, and be immune to biases derived from their pre-training in a nuanced, context-sensitive fashion. Do the prevalent *BERT-family of models do so? In this paper, we study this question using the problem of reasoning on tabular data. Tabular inputs are especially well-suited for the study—they admit systematic probes targeting the properties listed above. Our experiments demonstrate that a RoBERTa-based model, representative of the current state-of-the-art, fails at reasoning on the following counts: it (a) ignores relevant parts of the evidence, (b) is over-sensitive to annotation artifacts, and (c) relies on the knowledge encoded in the pre-trained language model rather than the evidence presented in its tabular inputs. Finally, through inoculation experiments, we show that fine-tuning the model on perturbed data does not help it overcome the above challenges.
%R 10.1162/tacl_a_00482
%U https://aclanthology.org/2022.tacl-1.38
%U https://doi.org/10.1162/tacl_a_00482
%P 659-679
Markdown (Informal)
[Is My Model Using the Right Evidence? Systematic Probes for Examining Evidence-Based Tabular Reasoning](https://aclanthology.org/2022.tacl-1.38) (Gupta et al., TACL 2022)
ACL
Vivek Gupta, Riyaz A. Bhat, Atreya Ghosal, Manish Shrivastava, Maneesh Singh, and Vivek Srikumar. 2022. Is My Model Using the Right Evidence? Systematic Probes for Examining Evidence-Based Tabular Reasoning. Transactions of the Association for Computational Linguistics, 10:659–679.