@inproceedings{ying-etal-2025-data,
title = "Data-Efficient Selection via Grammatical Complexity in Continual Pre-training of Domain-Specific {LLM}s",
author = "Ying, Yizhou and
Zhang, Geng and
Danxin, Cui and
Du, Chengyu and
Yue, Guanglei and
Jiang, Sihang and
Liang, Jiaqing and
Fu, Yifei and
Hu, Hailin and
Xiao, Yanghua",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1121/",
doi = "10.18653/v1/2025.emnlp-main.1121",
pages = "22055--22069",
ISBN = "979-8-89176-332-6",
abstract = "Data efficiency is crucial in domain-specific continual pre-training (CPT) of large language models (LLMs), especially under resource constraints. Aiming for ``small data, big impact,'' this work addresses the limitations of existing domain-specific data selection strategies, which often rely on scarce labeled data or computationally expensive LLMs. We introduce CDF Sampling with Grammatical Complexity (CDF-GC), an annotation-independent, efficient and interpretable data selection framework for CPT. Our approach comprehensively evaluates grammatical complexity using lexical diversity and syntactic complexity, and employs a cumulative distribution function (CDF)-based sampling strategy to balance complexity and diversity. To validate the effectiveness of CDF-GC, we conducted experiments on a financial dataset. The results demonstrate that CDF-GC significantly outperforms baselines, achieving 2.0{\%} improvement in financial QA at the same selection ratio and even surpassing full-data training by 1.7{\%} using only 20{\%} of the data."
}
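The abstract describes CDF-GC only at a high level: score each text for grammatical complexity (lexical diversity plus syntactic complexity), then use CDF-based sampling to balance complexity against diversity. As a rough illustration of that general idea — not the paper's actual implementation — here is a minimal Python sketch. The complexity proxies (type-token ratio, mean sentence length), the blending weight `alpha`, the length normalizer, and the per-stratum selection rule are all hypothetical stand-ins.

```python
def type_token_ratio(text: str) -> float:
    """Lexical-diversity proxy: unique tokens / total tokens."""
    tokens = text.lower().split()
    return len(set(tokens)) / len(tokens) if tokens else 0.0

def mean_sentence_length(text: str) -> float:
    """Crude syntactic-complexity proxy: average tokens per sentence."""
    sents = [s for s in text.replace("!", ".").replace("?", ".").split(".") if s.strip()]
    return sum(len(s.split()) for s in sents) / len(sents) if sents else 0.0

def complexity(text: str, alpha: float = 0.5) -> float:
    """Blend the two proxies into one score (alpha and the /50 length
    normalizer are illustrative choices, not taken from the paper)."""
    return alpha * type_token_ratio(text) + (1.0 - alpha) * mean_sentence_length(text) / 50.0

def cdf_select(texts: list[str], ratio: float = 0.2) -> list[str]:
    """Rank texts by complexity (rank/N is each text's empirical CDF value)
    and keep one text per equal-width CDF stratum, so the selection spans
    the whole complexity distribution instead of only the top scores."""
    ranked = sorted(texts, key=complexity)
    n, k = len(ranked), max(1, int(ratio * len(ranked)))
    return [ranked[min(int((i + 0.5) * n / k), n - 1)] for i in range(k)]
```

For example, `cdf_select(corpus, ratio=0.2)` mirrors the 20% selection ratio quoted in the abstract; stratifying over the empirical CDF rather than taking the top-k scores is one simple way to trade off complexity against diversity.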