% ACL Anthology record: paper in Findings of ACL 2026, pages 30504--30542.
% Acronyms and proper-noun words are brace-protected as whole words so that
% styles which recase titles keep their capitalisation.
% NOTE(review): address carries the location exactly as exported; confirm
% whether the target style expects the publisher city here instead.
@inproceedings{yuan-etal-2026-learning,
  title     = {Learning on Imbalanced Noisy Data via Debiased Sample Selection and {LLM}-Driven Annotation},
  author    = {Yuan, Bo and
               Chen, Yulin and
               Zhang, Yin},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1526/},
  pages     = {30504--30542},
  isbn      = {979-8-89176-395-1},
  abstract  = {Learning with Noisy Labels (LNL) is a challenge where the collected training set can contain incorrect or corrupted labels. Most existing solutions distinguish clean samples from noisy samples and query human experts on noisy samples for denoising. However, these solutions often operate under the unrealistic assumption that the distribution of classes is uniform, overlooking the skewed and imbalanced distributions frequently encountered in real-world scenarios. In this case, we empirically reveal that previous solutions suffer from both selection bias and training bias, leading to distinguish clean samples from noisy samples hardly. In this paper, our work introduces the imbalanced learning with noisy labels (i-LNL) task, which seeks to let the model learn from noisy labels within imbalanced distributions. A new benchmark (ImbaLNL-Bench) comprised of some synthetic and real-world datasets is created to provide a thorough representation of practical use cases. Besides, we propose an innovative collaborative learning framework DeCo for i-LNL tasks. Specifically, we first conduct debiased sample selection, consisting of a robust expert model and a debiased-enhanced threshold strategy, to better separate clean samples from noisy samples, especially for the tail classes. Then we feed selected clean samples to active annotator large language models (LLMs) for re-annotating noisy samples using in-context learning, which can better reduce human effort. Ultimately, we employ distinct loss functions adept at managing subsets with varying degrees of label noise. Extensive experimental results on synthetic and real-world datasets show the effectiveness and superiority of our method.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="yuan-etal-2026-learning">
    <titleInfo>
      <title>Learning on Imbalanced Noisy Data via Debiased Sample Selection and LLM-Driven Annotation</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Bo</namePart>
      <namePart type="family">Yuan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yulin</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yin</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Learning with Noisy Labels (LNL) is a challenge where the collected training set can contain incorrect or corrupted labels. Most existing solutions distinguish clean samples from noisy samples and query human experts on noisy samples for denoising. However, these solutions often operate under the unrealistic assumption that the distribution of classes is uniform, overlooking the skewed and imbalanced distributions frequently encountered in real-world scenarios. In this case, we empirically reveal that previous solutions suffer from both selection bias and training bias, leading to distinguish clean samples from noisy samples hardly. In this paper, our work introduces the imbalanced learning with noisy labels (i-LNL) task, which seeks to let the model learn from noisy labels within imbalanced distributions. A new benchmark (ImbaLNL-Bench) comprised of some synthetic and real-world datasets is created to provide a thorough representation of practical use cases. Besides, we propose an innovative collaborative learning framework DeCo for i-LNL tasks. Specifically, we first conduct debiased sample selection, consisting of a robust expert model and a debiased-enhanced threshold strategy, to better separate clean samples from noisy samples, especially for the tail classes. Then we feed selected clean samples to active annotator large language models (LLMs) for re-annotating noisy samples using in-context learning, which can better reduce human effort. Ultimately, we employ distinct loss functions adept at managing subsets with varying degrees of label noise. Extensive experimental results on synthetic and real-world datasets show the effectiveness and superiority of our method.</abstract>
    <identifier type="citekey">yuan-etal-2026-learning</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1526/</url>
    </location>
    <!-- Page extent of this paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>30504</start>
        <end>30542</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Learning on Imbalanced Noisy Data via Debiased Sample Selection and LLM-Driven Annotation
%A Yuan, Bo
%A Chen, Yulin
%A Zhang, Yin
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F yuan-etal-2026-learning
%X Learning with Noisy Labels (LNL) is a challenge where the collected training set can contain incorrect or corrupted labels. Most existing solutions distinguish clean samples from noisy samples and query human experts on noisy samples for denoising. However, these solutions often operate under the unrealistic assumption that the distribution of classes is uniform, overlooking the skewed and imbalanced distributions frequently encountered in real-world scenarios. In this case, we empirically reveal that previous solutions suffer from both selection bias and training bias, leading to distinguish clean samples from noisy samples hardly. In this paper, our work introduces the imbalanced learning with noisy labels (i-LNL) task, which seeks to let the model learn from noisy labels within imbalanced distributions. A new benchmark (ImbaLNL-Bench) comprised of some synthetic and real-world datasets is created to provide a thorough representation of practical use cases. Besides, we propose an innovative collaborative learning framework DeCo for i-LNL tasks. Specifically, we first conduct debiased sample selection, consisting of a robust expert model and a debiased-enhanced threshold strategy, to better separate clean samples from noisy samples, especially for the tail classes. Then we feed selected clean samples to active annotator large language models (LLMs) for re-annotating noisy samples using in-context learning, which can better reduce human effort. Ultimately, we employ distinct loss functions adept at managing subsets with varying degrees of label noise. Extensive experimental results on synthetic and real-world datasets show the effectiveness and superiority of our method.
%U https://aclanthology.org/2026.findings-acl.1526/
%P 30504-30542
Markdown (Informal)
[Learning on Imbalanced Noisy Data via Debiased Sample Selection and LLM-Driven Annotation](https://aclanthology.org/2026.findings-acl.1526/) (Yuan et al., Findings 2026)
ACL