@inproceedings{mitchell-etal-2022-enhancing,
title = "Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference",
author = "Mitchell, Eric and
Noh, Joseph and
Li, Siyan and
Armstrong, Will and
Agarwal, Ananth and
Liu, Patrick and
Finn, Chelsea and
Manning, Christopher",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.emnlp-main.115",
pages = "1754--1768",
abstract = "While large pre-trained language models are powerful, their predictions often lack logical consistency across test inputs. For example, a state-of-the-art Macaw question-answering (QA) model answers {\textless}i{\textgreater}Yes{\textless}/i{\textgreater} to {\textless}i{\textgreater}Is a sparrow a bird?{\textless}/i{\textgreater} and {\textless}i{\textgreater}Does a bird have feet?{\textless}/i{\textgreater} but answers {\textless}i{\textgreater}No{\textless}/i{\textgreater} to {\textless}i{\textgreater}Does a sparrow have feet?{\textless}/i{\textgreater}. To address this failure mode, we propose a framework, Consistency Correction through Relation Detection, or {\textless}b{\textgreater}ConCoRD{\textless}/b{\textgreater}, for boosting the consistency and accuracy of pre-trained NLP models using pre-trained natural language inference (NLI) models without fine-tuning or re-training. Given a batch of test inputs, ConCoRD samples several candidate outputs for each input and instantiates a factor graph that accounts for both the model{'}s belief about the likelihood of each answer choice in isolation and the NLI model{'}s beliefs about pair-wise answer choice compatibility. We show that a weighted MaxSAT solver can efficiently compute high-quality answer choices under this factor graph, improving over the raw model{'}s predictions. Our experiments demonstrate that ConCoRD consistently boosts accuracy and consistency of off-the-shelf closed-book QA and VQA models using off-the-shelf NLI models, notably increasing accuracy of LXMERT on ConVQA by 5{\%} absolute. See the project website (https://ericmitchell.ai/emnlp-2022-concord/) for code and data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mitchell-etal-2022-enhancing">
<titleInfo>
<title>Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Mitchell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Noh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siyan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Will</namePart>
<namePart type="family">Armstrong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ananth</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chelsea</namePart>
<namePart type="family">Finn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Manning</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While large pre-trained language models are powerful, their predictions often lack logical consistency across test inputs. For example, a state-of-the-art Macaw question-answering (QA) model answers "Yes" to "Is a sparrow a bird?" and "Does a bird have feet?" but answers "No" to "Does a sparrow have feet?". To address this failure mode, we propose a framework, Consistency Correction through Relation Detection, or ConCoRD, for boosting the consistency and accuracy of pre-trained NLP models using pre-trained natural language inference (NLI) models without fine-tuning or re-training. Given a batch of test inputs, ConCoRD samples several candidate outputs for each input and instantiates a factor graph that accounts for both the model’s belief about the likelihood of each answer choice in isolation and the NLI model’s beliefs about pair-wise answer choice compatibility. We show that a weighted MaxSAT solver can efficiently compute high-quality answer choices under this factor graph, improving over the raw model’s predictions. Our experiments demonstrate that ConCoRD consistently boosts accuracy and consistency of off-the-shelf closed-book QA and VQA models using off-the-shelf NLI models, notably increasing accuracy of LXMERT on ConVQA by 5% absolute. See the project website (https://ericmitchell.ai/emnlp-2022-concord/) for code and data.</abstract>
<identifier type="citekey">mitchell-etal-2022-enhancing</identifier>
<location>
<url>https://aclanthology.org/2022.emnlp-main.115</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>1754</start>
<end>1768</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference
%A Mitchell, Eric
%A Noh, Joseph
%A Li, Siyan
%A Armstrong, Will
%A Agarwal, Ananth
%A Liu, Patrick
%A Finn, Chelsea
%A Manning, Christopher
%S Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F mitchell-etal-2022-enhancing
%X While large pre-trained language models are powerful, their predictions often lack logical consistency across test inputs. For example, a state-of-the-art Macaw question-answering (QA) model answers "Yes" to "Is a sparrow a bird?" and "Does a bird have feet?" but answers "No" to "Does a sparrow have feet?". To address this failure mode, we propose a framework, Consistency Correction through Relation Detection, or ConCoRD, for boosting the consistency and accuracy of pre-trained NLP models using pre-trained natural language inference (NLI) models without fine-tuning or re-training. Given a batch of test inputs, ConCoRD samples several candidate outputs for each input and instantiates a factor graph that accounts for both the model’s belief about the likelihood of each answer choice in isolation and the NLI model’s beliefs about pair-wise answer choice compatibility. We show that a weighted MaxSAT solver can efficiently compute high-quality answer choices under this factor graph, improving over the raw model’s predictions. Our experiments demonstrate that ConCoRD consistently boosts accuracy and consistency of off-the-shelf closed-book QA and VQA models using off-the-shelf NLI models, notably increasing accuracy of LXMERT on ConVQA by 5% absolute. See the project website (https://ericmitchell.ai/emnlp-2022-concord/) for code and data.
%U https://aclanthology.org/2022.emnlp-main.115
%P 1754-1768
Markdown (Informal)
[Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference](https://aclanthology.org/2022.emnlp-main.115) (Mitchell et al., EMNLP 2022)
ACL
- Eric Mitchell, Joseph Noh, Siyan Li, Will Armstrong, Ananth Agarwal, Patrick Liu, Chelsea Finn, and Christopher Manning. 2022. Enhancing Self-Consistency and Performance of Pre-Trained Language Models through Natural Language Inference. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 1754–1768, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.