@inproceedings{misra-mahowald-2024-language,
title = "Language Models Learn Rare Phenomena from Less Rare Phenomena: The Case of the Missing {AANN}s",
author = "Misra, Kanishka and
Mahowald, Kyle",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.53",
doi = "10.18653/v1/2024.emnlp-main.53",
pages = "913--929",
abstract = "Language models learn rare syntactic phenomena, but the extent to which this is attributable to generalization vs. memorization is a major open question. To that end, we iteratively trained transformer language models on systematically manipulated corpora which were human-scale in size, and then evaluated their learning of a rare grammatical phenomenon: the English Article+Adjective+Numeral+Noun (AANN) construction ({``}a beautiful five days{''}). We compared how well this construction was learned on the default corpus relative to a counterfactual corpus in which AANN sentences were removed. We found that AANNs were still learned better than systematically perturbed variants of the construction. Using additional counterfactual corpora, we suggest that this learning occurs through generalization from related constructions (e.g., {``}a few days{''}). An additional experiment showed that this learning is enhanced when there is more variability in the input. Taken together, our results provide an existence proof that LMs can learn rare grammatical phenomena by generalization from less rare phenomena. Data and code: https://github.com/kanishkamisra/aannalysis.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="misra-mahowald-2024-language">
<titleInfo>
<title>Language Models Learn Rare Phenomena from Less Rare Phenomena: The Case of the Missing AANNs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kanishka</namePart>
<namePart type="family">Misra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Mahowald</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language models learn rare syntactic phenomena, but the extent to which this is attributable to generalization vs. memorization is a major open question. To that end, we iteratively trained transformer language models on systematically manipulated corpora which were human-scale in size, and then evaluated their learning of a rare grammatical phenomenon: the English Article+Adjective+Numeral+Noun (AANN) construction (“a beautiful five days”). We compared how well this construction was learned on the default corpus relative to a counterfactual corpus in which AANN sentences were removed. We found that AANNs were still learned better than systematically perturbed variants of the construction. Using additional counterfactual corpora, we suggest that this learning occurs through generalization from related constructions (e.g., “a few days”). An additional experiment showed that this learning is enhanced when there is more variability in the input. Taken together, our results provide an existence proof that LMs can learn rare grammatical phenomena by generalization from less rare phenomena. Data and code: https://github.com/kanishkamisra/aannalysis.</abstract>
<identifier type="citekey">misra-mahowald-2024-language</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.53</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.53</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>913</start>
<end>929</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Models Learn Rare Phenomena from Less Rare Phenomena: The Case of the Missing AANNs
%A Misra, Kanishka
%A Mahowald, Kyle
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F misra-mahowald-2024-language
%X Language models learn rare syntactic phenomena, but the extent to which this is attributable to generalization vs. memorization is a major open question. To that end, we iteratively trained transformer language models on systematically manipulated corpora which were human-scale in size, and then evaluated their learning of a rare grammatical phenomenon: the English Article+Adjective+Numeral+Noun (AANN) construction (“a beautiful five days”). We compared how well this construction was learned on the default corpus relative to a counterfactual corpus in which AANN sentences were removed. We found that AANNs were still learned better than systematically perturbed variants of the construction. Using additional counterfactual corpora, we suggest that this learning occurs through generalization from related constructions (e.g., “a few days”). An additional experiment showed that this learning is enhanced when there is more variability in the input. Taken together, our results provide an existence proof that LMs can learn rare grammatical phenomena by generalization from less rare phenomena. Data and code: https://github.com/kanishkamisra/aannalysis.
%R 10.18653/v1/2024.emnlp-main.53
%U https://aclanthology.org/2024.emnlp-main.53
%U https://doi.org/10.18653/v1/2024.emnlp-main.53
%P 913-929
Markdown (Informal)
[Language Models Learn Rare Phenomena from Less Rare Phenomena: The Case of the Missing AANNs](https://aclanthology.org/2024.emnlp-main.53) (Misra & Mahowald, EMNLP 2024)
ACL