@inproceedings{schaffelder-gatt-2026-synthetic,
title = "Synthetic Eggs in Many Baskets: The Impact of Synthetic Data Diversity on {LLM} Fine-Tuning",
author = "Schaffelder, Max and
Gatt, Albert",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.360/",
pages = "7265--7293",
ISBN = "979-8-89176-395-1",
abstract = "As synthetic data becomes widely used in language model development, understanding its impact on model behavior is crucial. This paper investigates the impact of the diversity of sources of synthetic data on fine-tuned large language models. We focus on three key dimensions: distribution collapse, adversarial robustness, and self-preference bias. Our findings reveal that fine-tuning models on synthetic data from diverse sources can mitigate distribution collapse, preserving the breadth of the output distribution and the diversity of the output text. Furthermore, while both human and synthetic fine-tuning data can remove safeguards, we observe a tendency for higher output quality in the latter case, thus making outputs potentially more usable and dangerous. Finally, we also find evidence that fine-tuning reduces self-preference bias, with human data being the most effective, followed by multi-source synthetic data. All code is available at https://github.com/maxschaffelder/synthetic{\_}data{\_}diversity."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schaffelder-gatt-2026-synthetic">
<titleInfo>
<title>Synthetic Eggs in Many Baskets: The Impact of Synthetic Data Diversity on LLM Fine-Tuning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Schaffelder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Gatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>As synthetic data becomes widely used in language model development, understanding its impact on model behavior is crucial. This paper investigates the impact of the diversity of sources of synthetic data on fine-tuned large language models. We focus on three key dimensions: distribution collapse, adversarial robustness, and self-preference bias. Our findings reveal that fine-tuning models on synthetic data from diverse sources can mitigate distribution collapse, preserving the breadth of the output distribution and the diversity of the output text. Furthermore, while both human and synthetic fine-tuning data can remove safeguards, we observe a tendency for higher output quality in the latter case, thus making outputs potentially more usable and dangerous. Finally, we also find evidence that fine-tuning reduces self-preference bias, with human data being the most effective, followed by multi-source synthetic data. All code is available at https://github.com/maxschaffelder/synthetic_data_diversity.</abstract>
<identifier type="citekey">schaffelder-gatt-2026-synthetic</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.360/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>7265</start>
<end>7293</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Synthetic Eggs in Many Baskets: The Impact of Synthetic Data Diversity on LLM Fine-Tuning
%A Schaffelder, Max
%A Gatt, Albert
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F schaffelder-gatt-2026-synthetic
%X As synthetic data becomes widely used in language model development, understanding its impact on model behavior is crucial. This paper investigates the impact of the diversity of sources of synthetic data on fine-tuned large language models. We focus on three key dimensions: distribution collapse, adversarial robustness, and self-preference bias. Our findings reveal that fine-tuning models on synthetic data from diverse sources can mitigate distribution collapse, preserving the breadth of the output distribution and the diversity of the output text. Furthermore, while both human and synthetic fine-tuning data can remove safeguards, we observe a tendency for higher output quality in the latter case, thus making outputs potentially more usable and dangerous. Finally, we also find evidence that fine-tuning reduces self-preference bias, with human data being the most effective, followed by multi-source synthetic data. All code is available at https://github.com/maxschaffelder/synthetic_data_diversity.
%U https://aclanthology.org/2026.findings-acl.360/
%P 7265-7293
Markdown (Informal)
[Synthetic Eggs in Many Baskets: The Impact of Synthetic Data Diversity on LLM Fine-Tuning](https://aclanthology.org/2026.findings-acl.360/) (Schaffelder & Gatt, Findings 2026)
ACL