% ACL Anthology record 2025.emnlp-main.236 (see the url and doi fields below).
@inproceedings{ye-etal-2025-scalable,
    title     = {Scalable Data Synthesis through Human-like Cognitive Imitation and Data Recombination},
    author    = {Ye, Zhongyi and
                 Zhang, Weitai and
                 Zhou, Xinyuan and
                 Zhu, Yongxin and
                 Rao, Ninghui and
                 Chen, Enhong},
    editor    = {Christodoulopoulos, Christos and
                 Chakraborty, Tanmoy and
                 Rose, Carolyn and
                 Peng, Violet},
    booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
    month     = nov,
    year      = {2025},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.emnlp-main.236/},
    doi       = {10.18653/v1/2025.emnlp-main.236},
    pages     = {4721--4735},
    isbn      = {979-8-89176-332-6},
    abstract  = {Large language models (LLMs) rely on massive amounts of training data, however, the quantity of empirically observed data is limited. To alleviate this issue, lots of LLMs leverage synthetic data to enhance the quantity of training data. Despite significant advancements in LLMs, the efficiency and scalability characteristics of data synthesis during pre-training phases remain insufficiently explored. In this work, we propose a novel data synthesis framework, Cognitive Combination Synthesis (CCS), designed to achieve highly efficient and scalable data synthesis. Specifically, our methodology mimics human cognitive behaviors by recombining and interconnecting heterogeneous data from diverse sources thereby enhancing advanced reasoning capabilities in LLMs. Extensive experiments demonstrate that: (1) effective data organization is essential, and our mapping-based combination learning approach significantly improves data utilization efficiency; (2) by enhancing data diversity, accuracy, and complexity, our synthetic data scales beyond 100B tokens, revealing CCS{'}s strong scalability. Our findings highlight the impact of data organization methods on LLM learning efficiency and the significant potential of scalable synthetic data to enhance model reasoning capabilities.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="ye-etal-2025-scalable">
    <titleInfo>
      <title>Scalable Data Synthesis through Human-like Cognitive Imitation and Data Recombination</title>
    </titleInfo>

    <!-- Authors of the paper, in record order. -->
    <name type="personal">
      <namePart type="given">Zhongyi</namePart>
      <namePart type="family">Ye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Weitai</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinyuan</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yongxin</namePart>
      <namePart type="family">Zhu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ninghui</namePart>
      <namePart type="family">Rao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Enhong</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-332-6</identifier>
    </relatedItem>

    <abstract>Large language models (LLMs) rely on massive amounts of training data, however, the quantity of empirically observed data is limited. To alleviate this issue, lots of LLMs leverage synthetic data to enhance the quantity of training data. Despite significant advancements in LLMs, the efficiency and scalability characteristics of data synthesis during pre-training phases remain insufficiently explored. In this work, we propose a novel data synthesis framework, Cognitive Combination Synthesis (CCS), designed to achieve highly efficient and scalable data synthesis. Specifically, our methodology mimics human cognitive behaviors by recombining and interconnecting heterogeneous data from diverse sources thereby enhancing advanced reasoning capabilities in LLMs. Extensive experiments demonstrate that: (1) effective data organization is essential, and our mapping-based combination learning approach significantly improves data utilization efficiency; (2) by enhancing data diversity, accuracy, and complexity, our synthetic data scales beyond 100B tokens, revealing CCS’s strong scalability. Our findings highlight the impact of data organization methods on LLM learning efficiency and the significant potential of scalable synthetic data to enhance model reasoning capabilities.</abstract>

    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">ye-etal-2025-scalable</identifier>
    <identifier type="doi">10.18653/v1/2025.emnlp-main.236</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-main.236/</url>
    </location>

    <!-- Page range of the paper within the host item. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>4721</start>
        <end>4735</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Scalable Data Synthesis through Human-like Cognitive Imitation and Data Recombination
%A Ye, Zhongyi
%A Zhang, Weitai
%A Zhou, Xinyuan
%A Zhu, Yongxin
%A Rao, Ninghui
%A Chen, Enhong
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F ye-etal-2025-scalable
%X Large language models (LLMs) rely on massive amounts of training data, however, the quantity of empirically observed data is limited. To alleviate this issue, lots of LLMs leverage synthetic data to enhance the quantity of training data. Despite significant advancements in LLMs, the efficiency and scalability characteristics of data synthesis during pre-training phases remain insufficiently explored. In this work, we propose a novel data synthesis framework, Cognitive Combination Synthesis (CCS), designed to achieve highly efficient and scalable data synthesis. Specifically, our methodology mimics human cognitive behaviors by recombining and interconnecting heterogeneous data from diverse sources thereby enhancing advanced reasoning capabilities in LLMs. Extensive experiments demonstrate that: (1) effective data organization is essential, and our mapping-based combination learning approach significantly improves data utilization efficiency; (2) by enhancing data diversity, accuracy, and complexity, our synthetic data scales beyond 100B tokens, revealing CCS’s strong scalability. Our findings highlight the impact of data organization methods on LLM learning efficiency and the significant potential of scalable synthetic data to enhance model reasoning capabilities.
%R 10.18653/v1/2025.emnlp-main.236
%U https://aclanthology.org/2025.emnlp-main.236/
%U https://doi.org/10.18653/v1/2025.emnlp-main.236
%P 4721-4735
Markdown (Informal)
[Scalable Data Synthesis through Human-like Cognitive Imitation and Data Recombination](https://aclanthology.org/2025.emnlp-main.236/) (Ye et al., EMNLP 2025)
ACL