@inproceedings{nair-etal-2024-babylm,
title = "{B}aby{LM} Challenge: Experimenting with Self-Distillation and Reverse-Distillation for Language Model Pre-Training on Constrained Datasets",
author = "Nair, Aakarsh and
Hancharova, Alina and
Kumar, Mayank and
Gharaee, Ali",
editor = "Hu, Michael Y. and
Mueller, Aaron and
Ross, Candace and
Williams, Adina and
Linzen, Tal and
Zhuang, Chengxu and
Choshen, Leshem and
Cotterell, Ryan and
Warstadt, Alex and
Wilcox, Ethan Gotlieb",
booktitle = "The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning",
month = nov,
year = "2024",
address = "Miami, FL, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.conll-babylm.3/",
pages = "28--36",
abstract = "Language models (LMs) exhibit significant data inefficiency compared to human learners. A child is able to master language while consuming less than 100 million words of input, while language models require orders of magnitude more tokens during training. Our submission to the BabyLM Challenge utilizes a combination of self-distillation and reverse-distillation to train a sequence of ensemble models with improved training characteristics on a fixed-size 10 million-word dataset. Self-distillation is used to generate an ensemble of models of a certain fixed size, while reverse distillation is used to train a more expressive larger model from a previously trained generation of relatively smaller models, while largely preserving learned accuracy.We find that ensembles consisting of two smaller models and one identical born-again model serve as ideal ensembles for each trained generation of model size. We demonstrate that, although our method is not novel, it provides consistent and modest performance improvements on the BLiMP and GLUE benchmarks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nair-etal-2024-babylm">
<titleInfo>
<title>BabyLM Challenge: Experimenting with Self-Distillation and Reverse-Distillation for Language Model Pre-Training on Constrained Datasets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aakarsh</namePart>
<namePart type="family">Nair</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alina</namePart>
<namePart type="family">Hancharova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mayank</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Gharaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">Y</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aaron</namePart>
<namePart type="family">Mueller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Candace</namePart>
<namePart type="family">Ross</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adina</namePart>
<namePart type="family">Williams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tal</namePart>
<namePart type="family">Linzen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengxu</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ethan</namePart>
<namePart type="given">Gotlieb</namePart>
<namePart type="family">Wilcox</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, FL, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language models (LMs) exhibit significant data inefficiency compared to human learners. A child can master language from less than 100 million words of input, whereas language models require orders of magnitude more tokens during training. Our submission to the BabyLM Challenge utilizes a combination of self-distillation and reverse-distillation to train a sequence of ensemble models with improved training characteristics on a fixed-size 10 million-word dataset. Self-distillation is used to generate an ensemble of models of a certain fixed size, while reverse-distillation is used to train a more expressive larger model from a previously trained generation of relatively smaller models, largely preserving learned accuracy. We find that ensembles consisting of two smaller models and one identical born-again model serve as ideal ensembles for each trained generation of model size. We demonstrate that, although our method is not novel, it provides consistent and modest performance improvements on the BLiMP and GLUE benchmarks.</abstract>
<identifier type="citekey">nair-etal-2024-babylm</identifier>
<location>
<url>https://aclanthology.org/2024.conll-babylm.3/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>28</start>
<end>36</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BabyLM Challenge: Experimenting with Self-Distillation and Reverse-Distillation for Language Model Pre-Training on Constrained Datasets
%A Nair, Aakarsh
%A Hancharova, Alina
%A Kumar, Mayank
%A Gharaee, Ali
%Y Hu, Michael Y.
%Y Mueller, Aaron
%Y Ross, Candace
%Y Williams, Adina
%Y Linzen, Tal
%Y Zhuang, Chengxu
%Y Choshen, Leshem
%Y Cotterell, Ryan
%Y Warstadt, Alex
%Y Wilcox, Ethan Gotlieb
%S The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, FL, USA
%F nair-etal-2024-babylm
%X Language models (LMs) exhibit significant data inefficiency compared to human learners. A child can master language from less than 100 million words of input, whereas language models require orders of magnitude more tokens during training. Our submission to the BabyLM Challenge utilizes a combination of self-distillation and reverse-distillation to train a sequence of ensemble models with improved training characteristics on a fixed-size 10 million-word dataset. Self-distillation is used to generate an ensemble of models of a certain fixed size, while reverse-distillation is used to train a more expressive larger model from a previously trained generation of relatively smaller models, largely preserving learned accuracy. We find that ensembles consisting of two smaller models and one identical born-again model serve as ideal ensembles for each trained generation of model size. We demonstrate that, although our method is not novel, it provides consistent and modest performance improvements on the BLiMP and GLUE benchmarks.
%U https://aclanthology.org/2024.conll-babylm.3/
%P 28-36
Markdown (Informal)
[BabyLM Challenge: Experimenting with Self-Distillation and Reverse-Distillation for Language Model Pre-Training on Constrained Datasets](https://aclanthology.org/2024.conll-babylm.3/) (Nair et al., CoNLL-BabyLM 2024)
ACL
Aakarsh Nair, Alina Hancharova, Mayank Kumar, and Ali Gharaee. 2024. BabyLM Challenge: Experimenting with Self-Distillation and Reverse-Distillation for Language Model Pre-Training on Constrained Datasets. In The 2nd BabyLM Challenge at the 28th Conference on Computational Natural Language Learning, pages 28–36, Miami, FL, USA. Association for Computational Linguistics.
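
The abstract describes the training recipe only at a high level. As a rough, non-authoritative illustration, the following is a minimal sketch of a generic knowledge-distillation objective in PyTorch, of the kind that self-distillation and reverse-distillation both build on. The function name, the hyperparameters `temperature` and `alpha`, and the suggestion of averaging teacher logits are assumptions for illustration, not details taken from the paper.

```python
# Hypothetical sketch of a knowledge-distillation loss, assuming PyTorch.
# Not the authors' training code: ensemble construction, model sizes, and the
# hyperparameters below are illustrative assumptions only.
import torch
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, labels,
                      temperature: float = 2.0, alpha: float = 0.5):
    """Blend a soft-target (teacher) loss with the usual hard-label loss.

    In self-distillation the teacher and student share an architecture; in
    reverse-distillation the student is a larger model trained from smaller
    teachers (e.g. by averaging their logits before calling this function).
    """
    # Soft targets: KL divergence between temperature-scaled distributions,
    # rescaled by T^2 as in standard distillation formulations.
    soft = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)
    # Hard targets: ordinary cross-entropy against the gold labels.
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard


if __name__ == "__main__":
    # Toy shapes: a batch of 4 positions over a 50-symbol vocabulary.
    student = torch.randn(4, 50)
    teacher = torch.randn(4, 50)
    gold = torch.randint(0, 50, (4,))
    print(distillation_loss(student, teacher, gold).item())
```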