@inproceedings{kang-etal-2025-demystifying,
title = "Demystifying Synthetic Data in {LLM} Pre-training: A Systematic Study of Scaling Laws, Benefits, and Pitfalls",
author = "Kang, Feiyang and
Ardalani, Newsha and
Kuchnik, Michael and
Emad, Youssef and
Elhoushi, Mostafa and
Sengupta, Shubhabrata and
Li, Shang-Wen and
Raghavendra, Ramya and
Jia, Ruoxi and
Wu, Carole-Jean",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.544/",
pages = "10750--10769",
ISBN = "979-8-89176-332-6",
abstract = "Training data plays a crucial role in Large Language Models (LLM) scaling, yet high quality data is of limited supply. Synthetic data techniques offer a potential path toward sidestepping these limitations.We conduct a large-scale empirical investigation ({\ensuremath{>}}1000 LLMs with {\ensuremath{>}}100k GPU hours) using a unified protocol and scaling laws, comparing natural web data, diverse synthetic types (rephrased text, generated textbooks), and mixtures of natural and synthetic data. Specifically, we found pre-training on rephrased synthetic data $\textit{alone}$ is not faster than pre-training on natural web texts; while pre-training on 1/3 rephrased synthetic data mixed with 2/3 natural web texts can speed up 5-10x (to reach the same validation loss) at larger data budgets. Pre-training on textbook-style synthetic data $\textit{alone}$ results in notably higher loss on many downstream domains especially at small data budgets. ``Good'' ratios of synthetic data in training data mixtures depend on the model size and data budget, empirically converging to {\textasciitilde}30{\%} for rephrased synthetic data. Larger generator models do not necessarily yield better pre-training data than {\textasciitilde}8B-param models. These results contribute mixed evidence on ``model collapse'' during large-scale single-round (n=1) model training on synthetic data{--}training on rephrased synthetic data shows no degradation in performance in foreseeable scales whereas training on mixtures of textbook-style pure-generated synthetic data shows patterns predicted by ``model collapse''. Our work demystifies synthetic data in pre-training, validates its conditional benefits, and offers practical guidance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kang-etal-2025-demystifying">
<titleInfo>
<title>Demystifying Synthetic Data in LLM Pre-training: A Systematic Study of Scaling Laws, Benefits, and Pitfalls</title>
</titleInfo>
<name type="personal">
<namePart type="given">Feiyang</namePart>
<namePart type="family">Kang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Newsha</namePart>
<namePart type="family">Ardalani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Kuchnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youssef</namePart>
<namePart type="family">Emad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mostafa</namePart>
<namePart type="family">Elhoushi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shubhabrata</namePart>
<namePart type="family">Sengupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shang-Wen</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramya</namePart>
<namePart type="family">Raghavendra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruoxi</namePart>
<namePart type="family">Jia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carole-Jean</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Training data plays a crucial role in Large Language Models (LLM) scaling, yet high quality data is of limited supply. Synthetic data techniques offer a potential path toward sidestepping these limitations. We conduct a large-scale empirical investigation (>1000 LLMs with >100k GPU hours) using a unified protocol and scaling laws, comparing natural web data, diverse synthetic types (rephrased text, generated textbooks), and mixtures of natural and synthetic data. Specifically, we found pre-training on rephrased synthetic data alone is not faster than pre-training on natural web texts; while pre-training on 1/3 rephrased synthetic data mixed with 2/3 natural web texts can speed up 5-10x (to reach the same validation loss) at larger data budgets. Pre-training on textbook-style synthetic data alone results in notably higher loss on many downstream domains especially at small data budgets. “Good” ratios of synthetic data in training data mixtures depend on the model size and data budget, empirically converging to ~30% for rephrased synthetic data. Larger generator models do not necessarily yield better pre-training data than ~8B-param models. These results contribute mixed evidence on “model collapse” during large-scale single-round (n=1) model training on synthetic data–training on rephrased synthetic data shows no degradation in performance in foreseeable scales whereas training on mixtures of textbook-style pure-generated synthetic data shows patterns predicted by “model collapse”. Our work demystifies synthetic data in pre-training, validates its conditional benefits, and offers practical guidance.</abstract>
<identifier type="citekey">kang-etal-2025-demystifying</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.544/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>10750</start>
<end>10769</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Demystifying Synthetic Data in LLM Pre-training: A Systematic Study of Scaling Laws, Benefits, and Pitfalls
%A Kang, Feiyang
%A Ardalani, Newsha
%A Kuchnik, Michael
%A Emad, Youssef
%A Elhoushi, Mostafa
%A Sengupta, Shubhabrata
%A Li, Shang-Wen
%A Raghavendra, Ramya
%A Jia, Ruoxi
%A Wu, Carole-Jean
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F kang-etal-2025-demystifying
%X Training data plays a crucial role in Large Language Models (LLM) scaling, yet high quality data is of limited supply. Synthetic data techniques offer a potential path toward sidestepping these limitations. We conduct a large-scale empirical investigation (>1000 LLMs with >100k GPU hours) using a unified protocol and scaling laws, comparing natural web data, diverse synthetic types (rephrased text, generated textbooks), and mixtures of natural and synthetic data. Specifically, we found pre-training on rephrased synthetic data alone is not faster than pre-training on natural web texts; while pre-training on 1/3 rephrased synthetic data mixed with 2/3 natural web texts can speed up 5-10x (to reach the same validation loss) at larger data budgets. Pre-training on textbook-style synthetic data alone results in notably higher loss on many downstream domains especially at small data budgets. “Good” ratios of synthetic data in training data mixtures depend on the model size and data budget, empirically converging to ~30% for rephrased synthetic data. Larger generator models do not necessarily yield better pre-training data than ~8B-param models. These results contribute mixed evidence on “model collapse” during large-scale single-round (n=1) model training on synthetic data–training on rephrased synthetic data shows no degradation in performance in foreseeable scales whereas training on mixtures of textbook-style pure-generated synthetic data shows patterns predicted by “model collapse”. Our work demystifies synthetic data in pre-training, validates its conditional benefits, and offers practical guidance.
%U https://aclanthology.org/2025.emnlp-main.544/
%P 10750-10769
Markdown (Informal)
[Demystifying Synthetic Data in LLM Pre-training: A Systematic Study of Scaling Laws, Benefits, and Pitfalls](https://aclanthology.org/2025.emnlp-main.544/) (Kang et al., EMNLP 2025)
ACL
Feiyang Kang, Newsha Ardalani, Michael Kuchnik, Youssef Emad, Mostafa Elhoushi, Shubhabrata Sengupta, Shang-Wen Li, Ramya Raghavendra, Ruoxi Jia, and Carole-Jean Wu. 2025. Demystifying Synthetic Data in LLM Pre-training: A Systematic Study of Scaling Laws, Benefits, and Pitfalls. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 10750–10769, Suzhou, China. Association for Computational Linguistics.