% BibTeX record for a Findings of EMNLP 2025 paper (see the url/doi fields).
% Field values are brace-delimited; the acronym in booktitle is brace-protected
% so that styles which recase titles keep its capitalisation.
@inproceedings{hanafi-etal-2025-identifying,
  title     = {Identifying Noise in Human-Created Datasets using Training Dynamics from Generative Models},
  author    = {Hanafi, Maeda and
               Jindal, Ishan and
               Katsis, Yannis and
               Popa, Lucian and
               Zhu, Huaiyu},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.840/},
  doi       = {10.18653/v1/2025.findings-emnlp.840},
  pages     = {15534--15550},
  isbn      = {979-8-89176-335-7},
  abstract  = {Instruction fine-tuning enhances the alignment of autoregressive language models (ArLMs) with human intent but relies on large-scale annotated datasets prone to label and text noise. In this paper, we show that existing noise detection techniques designed for autoencoder models (AeLMs) do not directly generalize to ArLMs due to differences in learning dynamics. We propose TDRanker, a novel approach leveraging training dynamics to rank datapoints from easy-to-learn to hard-to-learn, effectively identifying noisy instances. Our method demonstrates robustness across multiple model architectures covering both autoencoder and autoregressive language models (GPT-2, BERT, LaMini-Cerebras-256M) and across various dataset noise levels, achieving at least 2x faster denoising than previous techniques. Applied to real-world classification and generative tasks, TDRanker significantly improves data quality and model performance. These findings suggest that TDRanker provides a scalable solution for refining instruction-tuning datasets, enhancing the reliability of fine-tuned ArLMs in practical applications.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier further down. -->
  <mods ID="hanafi-etal-2025-identifying">
    <titleInfo>
      <title>Identifying Noise in Human-Created Datasets using Training Dynamics from Generative Models</title>
    </titleInfo>

    <!-- Contributors with the "author" role, in listed order. -->
    <name type="personal">
      <namePart type="given">Maeda</namePart>
      <namePart type="family">Hanafi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ishan</namePart>
      <namePart type="family">Jindal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yannis</namePart>
      <namePart type="family">Katsis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lucian</namePart>
      <namePart type="family">Popa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Huaiyu</namePart>
      <namePart type="family">Zhu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the volume this record appears in, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>

    <abstract>Instruction fine-tuning enhances the alignment of autoregressive language models (ArLMs) with human intent but relies on large-scale annotated datasets prone to label and text noise. In this paper, we show that existing noise detection techniques designed for autoencoder models (AeLMs) do not directly generalize to ArLMs due to differences in learning dynamics. We propose TDRanker, a novel approach leveraging training dynamics to rank datapoints from easy-to-learn to hard-to-learn, effectively identifying noisy instances. Our method demonstrates robustness across multiple model architectures covering both autoencoder and autoregressive language models (GPT-2, BERT, LaMini-Cerebras-256M) and across various dataset noise levels, achieving at least 2x faster denoising than previous techniques. Applied to real-world classification and generative tasks, TDRanker significantly improves data quality and model performance. These findings suggest that TDRanker provides a scalable solution for refining instruction-tuning datasets, enhancing the reliability of fine-tuned ArLMs in practical applications.</abstract>

    <!-- Identifiers and online location of the record. -->
    <identifier type="citekey">hanafi-etal-2025-identifying</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-emnlp.840</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.840/</url>
    </location>

    <!-- Page extent within the host item. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>15534</start>
        <end>15550</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Identifying Noise in Human-Created Datasets using Training Dynamics from Generative Models
%A Hanafi, Maeda
%A Jindal, Ishan
%A Katsis, Yannis
%A Popa, Lucian
%A Zhu, Huaiyu
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F hanafi-etal-2025-identifying
%X Instruction fine-tuning enhances the alignment of autoregressive language models (ArLMs) with human intent but relies on large-scale annotated datasets prone to label and text noise. In this paper, we show that existing noise detection techniques designed for autoencoder models (AeLMs) do not directly generalize to ArLMs due to differences in learning dynamics. We propose TDRanker, a novel approach leveraging training dynamics to rank datapoints from easy-to-learn to hard-to-learn, effectively identifying noisy instances. Our method demonstrates robustness across multiple model architectures covering both autoencoder and autoregressive language models (GPT-2, BERT, LaMini-Cerebras-256M) and across various dataset noise levels, achieving at least 2x faster denoising than previous techniques. Applied to real-world classification and generative tasks, TDRanker significantly improves data quality and model performance. These findings suggest that TDRanker provides a scalable solution for refining instruction-tuning datasets, enhancing the reliability of fine-tuned ArLMs in practical applications.
%R 10.18653/v1/2025.findings-emnlp.840
%U https://aclanthology.org/2025.findings-emnlp.840/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.840
%P 15534-15550
Markdown (Informal)
[Identifying Noise in Human-Created Datasets using Training Dynamics from Generative Models](https://aclanthology.org/2025.findings-emnlp.840/) (Hanafi et al., Findings 2025)
ACL