@inproceedings{longpre-etal-2024-pretrainers,
title = "A Pretrainer`s Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, {\&} Toxicity",
author = "Longpre, Shayne and
Yauney, Gregory and
Reif, Emily and
Lee, Katherine and
Roberts, Adam and
Zoph, Barret and
Zhou, Denny and
Wei, Jason and
Robinson, Kevin and
Mimno, David and
Ippolito, Daphne",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.179/",
doi = "10.18653/v1/2024.naacl-long.179",
pages = "3245--3276",
abstract = "Pretraining data design is critically under-documented and often guided by empirically unsupported intuitions. We pretrain models on data curated (1) at different collection times, (2) with varying toxicity and quality filters, and (3) with different domain compositions. First, we find that temporal shift between evaluation data and pretraining data leads to performance degradation, which is not overcome by finetuning. Second, we measure the effect of quality and toxicity filters, showing a trade-off between performance on standard benchmarks and risk of toxic generations. We also find that the effects of different types of filtering are not predictable from text domain characteristics. Third, we empirically validate that heterogeneous data sources, like books and web, are beneficial and warrant greater prioritization. To date, these experiments constitute the single largest publicly documented empirical study of the effects of pretraining data. Spanning 28 unique 1.5 billion parameter models pretrained from scratch, these findings validate, quantify, and expose many undocumented intuitions about text pretraining, which ultimately support more informed data-centric decisions in model development."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="longpre-etal-2024-pretrainers">
<titleInfo>
<title>A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, &amp; Toxicity</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shayne</namePart>
<namePart type="family">Longpre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gregory</namePart>
<namePart type="family">Yauney</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Reif</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katherine</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barret</namePart>
<namePart type="family">Zoph</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denny</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Robinson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Mimno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daphne</namePart>
<namePart type="family">Ippolito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Pretraining data design is critically under-documented and often guided by empirically unsupported intuitions. We pretrain models on data curated (1) at different collection times, (2) with varying toxicity and quality filters, and (3) with different domain compositions. First, we find that temporal shift between evaluation data and pretraining data leads to performance degradation, which is not overcome by finetuning. Second, we measure the effect of quality and toxicity filters, showing a trade-off between performance on standard benchmarks and risk of toxic generations. We also find that the effects of different types of filtering are not predictable from text domain characteristics. Third, we empirically validate that heterogeneous data sources, like books and web, are beneficial and warrant greater prioritization. To date, these experiments constitute the single largest publicly documented empirical study of the effects of pretraining data. Spanning 28 unique 1.5 billion parameter models pretrained from scratch, these findings validate, quantify, and expose many undocumented intuitions about text pretraining, which ultimately support more informed data-centric decisions in model development.</abstract>
<identifier type="citekey">longpre-etal-2024-pretrainers</identifier>
<identifier type="doi">10.18653/v1/2024.naacl-long.179</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.179/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3245</start>
<end>3276</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity
%A Longpre, Shayne
%A Yauney, Gregory
%A Reif, Emily
%A Lee, Katherine
%A Roberts, Adam
%A Zoph, Barret
%A Zhou, Denny
%A Wei, Jason
%A Robinson, Kevin
%A Mimno, David
%A Ippolito, Daphne
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F longpre-etal-2024-pretrainers
%X Pretraining data design is critically under-documented and often guided by empirically unsupported intuitions. We pretrain models on data curated (1) at different collection times, (2) with varying toxicity and quality filters, and (3) with different domain compositions. First, we find that temporal shift between evaluation data and pretraining data leads to performance degradation, which is not overcome by finetuning. Second, we measure the effect of quality and toxicity filters, showing a trade-off between performance on standard benchmarks and risk of toxic generations. We also find that the effects of different types of filtering are not predictable from text domain characteristics. Third, we empirically validate that heterogeneous data sources, like books and web, are beneficial and warrant greater prioritization. To date, these experiments constitute the single largest publicly documented empirical study of the effects of pretraining data. Spanning 28 unique 1.5 billion parameter models pretrained from scratch, these findings validate, quantify, and expose many undocumented intuitions about text pretraining, which ultimately support more informed data-centric decisions in model development.
%R 10.18653/v1/2024.naacl-long.179
%U https://aclanthology.org/2024.naacl-long.179/
%U https://doi.org/10.18653/v1/2024.naacl-long.179
%P 3245-3276
Markdown (Informal)
[A Pretrainer’s Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity](https://aclanthology.org/2024.naacl-long.179/) (Longpre et al., NAACL 2024)
ACL
- Shayne Longpre, Gregory Yauney, Emily Reif, Katherine Lee, Adam Roberts, Barret Zoph, Denny Zhou, Jason Wei, Kevin Robinson, David Mimno, and Daphne Ippolito. 2024. A Pretrainer’s Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity. In Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 3245–3276, Mexico City, Mexico. Association for Computational Linguistics.