@inproceedings{xu-etal-2025-structure,
title = "Structure Trumps Size: Rethinking Data Quality for {LLM} Reasoning",
author = "Xu, Hu and
Li, Zeyan and
Wang, Rui and
Xu, Jianfeng",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.616/",
pages = "11489--11513",
ISBN = "979-8-89176-335-7",
abstract = "As domain-specific datasets continue to expand, Large Language Models (LLMs) have achieved significant improvements across various fields through supervised fine-tuning (SFT). However, is more data always better for model fine-tuning? Through a series of controlled experiments, we discover that dataset structure{---}rather than mere size{---}plays a decisive role in enhancing LLM reasoning capabilities. While existing methods acknowledge that good data quality can make training more efficient, they primarily rely on simple heuristic strategies and lack systematic, quantitative frameworks for evaluating data quality. To address this gap, we introduce MCSQ{---}the first multi-dimensional quantitative framework for reasoning data management. MCSQ rigorously evaluates and optimizes datasets along six orthogonal dimensions. Through comprehensive controlled experiments, we find that selectively incorporating ``distorted'' (model-disagreed) or ``mismatched'' (low-relevance) samples{---}which are typically discarded in traditional approaches{---}can outperform conventional ``clean'' data on certain advanced reasoning benchmarks. Our findings challenge traditional assumptions about data ``quality'' in LLM fine-tuning and provide actionable, quantitative guidance for efficient, structure-aware dataset management. The datasets and codes are both available at https://github.com/xuhu0115/MCSQ."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2025-structure">
<titleInfo>
<title>Structure Trumps Size: Rethinking Data Quality for LLM Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hu</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeyan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianfeng</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>As domain-specific datasets continue to expand, Large Language Models (LLMs) have achieved significant improvements across various fields through supervised fine-tuning (SFT). However, is more data always better for model fine-tuning? Through a series of controlled experiments, we discover that dataset structure—rather than mere size—plays a decisive role in enhancing LLM reasoning capabilities. While existing methods acknowledge that good data quality can make training more efficient, they primarily rely on simple heuristic strategies and lack systematic, quantitative frameworks for evaluating data quality. To address this gap, we introduce MCSQ—the first multi-dimensional quantitative framework for reasoning data management. MCSQ rigorously evaluates and optimizes datasets along six orthogonal dimensions. Through comprehensive controlled experiments, we find that selectively incorporating “distorted” (model-disagreed) or “mismatched” (low-relevance) samples—which are typically discarded in traditional approaches—can outperform conventional “clean” data on certain advanced reasoning benchmarks. Our findings challenge traditional assumptions about data “quality” in LLM fine-tuning and provide actionable, quantitative guidance for efficient, structure-aware dataset management. The datasets and codes are both available at https://github.com/xuhu0115/MCSQ.</abstract>
<identifier type="citekey">xu-etal-2025-structure</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.616/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>11489</start>
<end>11513</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Structure Trumps Size: Rethinking Data Quality for LLM Reasoning
%A Xu, Hu
%A Li, Zeyan
%A Wang, Rui
%A Xu, Jianfeng
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F xu-etal-2025-structure
%X As domain-specific datasets continue to expand, Large Language Models (LLMs) have achieved significant improvements across various fields through supervised fine-tuning (SFT). However, is more data always better for model fine-tuning? Through a series of controlled experiments, we discover that dataset structure—rather than mere size—plays a decisive role in enhancing LLM reasoning capabilities. While existing methods acknowledge that good data quality can make training more efficient, they primarily rely on simple heuristic strategies and lack systematic, quantitative frameworks for evaluating data quality. To address this gap, we introduce MCSQ—the first multi-dimensional quantitative framework for reasoning data management. MCSQ rigorously evaluates and optimizes datasets along six orthogonal dimensions. Through comprehensive controlled experiments, we find that selectively incorporating “distorted” (model-disagreed) or “mismatched” (low-relevance) samples—which are typically discarded in traditional approaches—can outperform conventional “clean” data on certain advanced reasoning benchmarks. Our findings challenge traditional assumptions about data “quality” in LLM fine-tuning and provide actionable, quantitative guidance for efficient, structure-aware dataset management. The datasets and codes are both available at https://github.com/xuhu0115/MCSQ.
%U https://aclanthology.org/2025.findings-emnlp.616/
%P 11489-11513
Markdown (Informal)
[Structure Trumps Size: Rethinking Data Quality for LLM Reasoning](https://aclanthology.org/2025.findings-emnlp.616/) (Xu et al., Findings 2025)
ACL