@inproceedings{lyman-hepner-2024-whatif,
title = "{W}hat{I}f: Leveraging Word Vectors for Small-Scale Data Augmentation",
author = "Lyman, Alex and
Hepner, Bryce",
editor = "Hu, Michael Y. and
Mueller, Aaron and
Ross, Candace and
Williams, Adina and
Linzen, Tal and
Zhuang, Chengxu and
Choshen, Leshem and
Cotterell, Ryan and
Warstadt, Alex and
Wilcox, Ethan Gotlieb",
booktitle = "The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning",
month = nov,
year = "2024",
address = "Miami, FL, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.conll-babylm.20/",
pages = "229--236",
abstract = "We introduce WhatIf, a lightly supervised data augmentation technique that leverages word vectors to enhance training data for small-scale language models. Inspired by reading prediction strategies used in education, WhatIf creates new samples by substituting semantically similar words in the training data. We evaluate WhatIf on multiple datasets, demonstrating small but consistent improvements in downstream evaluation compared to baseline models. Finally, we compare WhatIf to other small-scale data augmentation techniques and find that it provides comparable quantitative results at a potential tradeoff to qualitative evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lyman-hepner-2024-whatif">
<titleInfo>
<title>WhatIf: Leveraging Word Vectors for Small-Scale Data Augmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Lyman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryce</namePart>
<namePart type="family">Hepner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">Y</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candace</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adina</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengxu</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ethan</namePart>
<namePart type="given">Gotlieb</namePart>
<namePart type="family">Wilcox</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, FL, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce WhatIf, a lightly supervised data augmentation technique that leverages word vectors to enhance training data for small-scale language models. Inspired by reading prediction strategies used in education, WhatIf creates new samples by substituting semantically similar words in the training data. We evaluate WhatIf on multiple datasets, demonstrating small but consistent improvements in downstream evaluation compared to baseline models. Finally, we compare WhatIf to other small-scale data augmentation techniques and find that it provides comparable quantitative results at a potential tradeoff to qualitative evaluation.</abstract>
<identifier type="citekey">lyman-hepner-2024-whatif</identifier>
<location>
<url>https://aclanthology.org/2024.conll-babylm.20/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>229</start>
<end>236</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WhatIf: Leveraging Word Vectors for Small-Scale Data Augmentation
%A Lyman, Alex
%A Hepner, Bryce
%Y Hu, Michael Y.
%Y Mueller, Aaron
%Y Ross, Candace
%Y Williams, Adina
%Y Linzen, Tal
%Y Zhuang, Chengxu
%Y Choshen, Leshem
%Y Cotterell, Ryan
%Y Warstadt, Alex
%Y Wilcox, Ethan Gotlieb
%S The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, FL, USA
%F lyman-hepner-2024-whatif
%X We introduce WhatIf, a lightly supervised data augmentation technique that leverages word vectors to enhance training data for small-scale language models. Inspired by reading prediction strategies used in education, WhatIf creates new samples by substituting semantically similar words in the training data. We evaluate WhatIf on multiple datasets, demonstrating small but consistent improvements in downstream evaluation compared to baseline models. Finally, we compare WhatIf to other small-scale data augmentation techniques and find that it provides comparable quantitative results at a potential tradeoff to qualitative evaluation.
%U https://aclanthology.org/2024.conll-babylm.20/
%P 229-236
Markdown (Informal)
[WhatIf: Leveraging Word Vectors for Small-Scale Data Augmentation](https://aclanthology.org/2024.conll-babylm.20/) (Lyman & Hepner, CoNLL-BabyLM 2024)
ACL