@inproceedings{bandel-etal-2022-lexical,
title = "Lexical Generalization Improves with Larger Models and Longer Training",
author = "Bandel, Elron and
Goldberg, Yoav and
Elazar, Yanai",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.323",
doi = "10.18653/v1/2022.findings-emnlp.323",
pages = "4398--4410",
abstract = "While fine-tuned language models perform well on many language tasks, they were also shown to rely on superficial surface features such as lexical overlap. Excessive utilization of such heuristics can lead to failure on challenging inputs. We analyze the use of lexical overlap heuristics in natural language inference, paraphrase detection, and reading comprehension (using a novel contrastive dataset),and find that larger models are much less susceptible to adopting lexical overlap heuristics. We also find that longer training leads models to abandon lexical overlap heuristics. Finally, We provide evidence that the disparity between models size has its source in the pre-trained model.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bandel-etal-2022-lexical">
<titleInfo>
<title>Lexical Generalization Improves with Larger Models and Longer Training</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elron</namePart>
<namePart type="family">Bandel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanai</namePart>
<namePart type="family">Elazar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zornitsa</namePart>
<namePart type="family">Kozareva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While fine-tuned language models perform well on many language tasks, they were also shown to rely on superficial surface features such as lexical overlap. Excessive utilization of such heuristics can lead to failure on challenging inputs. We analyze the use of lexical overlap heuristics in natural language inference, paraphrase detection, and reading comprehension (using a novel contrastive dataset),and find that larger models are much less susceptible to adopting lexical overlap heuristics. We also find that longer training leads models to abandon lexical overlap heuristics. Finally, We provide evidence that the disparity between models size has its source in the pre-trained model.</abstract>
<identifier type="citekey">bandel-etal-2022-lexical</identifier>
<identifier type="doi">10.18653/v1/2022.findings-emnlp.323</identifier>
<location>
<url>https://aclanthology.org/2022.findings-emnlp.323</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>4398</start>
<end>4410</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lexical Generalization Improves with Larger Models and Longer Training
%A Bandel, Elron
%A Goldberg, Yoav
%A Elazar, Yanai
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F bandel-etal-2022-lexical
%X While fine-tuned language models perform well on many language tasks, they have also been shown to rely on superficial surface features such as lexical overlap. Excessive use of such heuristics can lead to failure on challenging inputs. We analyze the use of lexical overlap heuristics in natural language inference, paraphrase detection, and reading comprehension (using a novel contrastive dataset), and find that larger models are much less susceptible to adopting lexical overlap heuristics. We also find that longer training leads models to abandon lexical overlap heuristics. Finally, we provide evidence that the disparity between model sizes has its source in the pre-trained model.
%R 10.18653/v1/2022.findings-emnlp.323
%U https://aclanthology.org/2022.findings-emnlp.323
%U https://doi.org/10.18653/v1/2022.findings-emnlp.323
%P 4398-4410
Markdown (Informal)
[Lexical Generalization Improves with Larger Models and Longer Training](https://aclanthology.org/2022.findings-emnlp.323) (Bandel et al., Findings 2022)
ACL
Elron Bandel, Yoav Goldberg, and Yanai Elazar. 2022. Lexical Generalization Improves with Larger Models and Longer Training. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 4398–4410, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.