% BibTeX record for the NAACL 2025 long paper "FiNE" (key: he-etal-2025-fine).
% NOTE(review): the author "Xiaoqing, Li" may have family/given names swapped
% (Li is far more common as a family name) -- confirm against the paper before changing.
@inproceedings{he-etal-2025-fine,
  title     = {{FiNE}: Filtering and Improving Noisy Data Elaborately with Large Language Models},
  author    = {He, Junliang and
               Fan, Ziyue and
               Kuang, Shaohui and
               Xiaoqing, Li and
               Song, Kai and
               Zhou, Yaqian and
               Qiu, Xipeng},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.437/},
  doi       = {10.18653/v1/2025.naacl-long.437},
  pages     = {8686--8707},
  isbn      = {979-8-89176-189-6},
  abstract  = {Data is the lifeblood of large language models (LLMs). While the quantity of open-source data available for training LLMs is substantial, its integrity often falls short. For instance, the open-source chat version of Yi-1.5-9B scores 5.20 on AlignBench, while the Chinese Alpaca-GPT4 version scores 4.12. This discrepancy makes it challenging for developers to create models that excel in downstream tasks and instruction following. Therefore, it is essential to improve data integrity. Currently, there are two mainstream methods for enhancing data integrity: data filtering and data augmentation. Due to the labor-intensive and time-consuming nature of performing these tasks manually, some of these efforts are now being undertaken by LLMs, owing to their high alignment with human preferences. However, we have found that performing data filtering or data augmentation with LLMs has limited effectiveness in improving data integrity. In this work, we propose FiNE (Filtering and Improving Noisy data Elaborately), a method that performs refined filtering and improvement of training data with LLMs. Using the data obtained through our method to train Yi-1.5-9B, the performance gap on AlignBench between our model and the open-source chat version is reduced from 1.08 to 0.35. Additionally, on HalluQA, our model surpasses the open-source chat version by 8.45.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey he-etal-2025-fine: seven authors, followed by the host proceedings volume with its three editors. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="he-etal-2025-fine">
    <titleInfo>
      <title>FiNE: Filtering and Improving Noisy Data Elaborately with Large Language Models</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Junliang</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ziyue</namePart>
      <namePart type="family">Fan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shaohui</namePart>
      <namePart type="family">Kuang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Li</namePart>
      <namePart type="family">Xiaoqing</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kai</namePart>
      <namePart type="family">Song</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yaqian</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xipeng</namePart>
      <namePart type="family">Qiu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>Data is the lifeblood of large language models (LLMs). While the quantity of open-source data available for training LLMs is substantial, its integrity often falls short. For instance, the open-source chat version of Yi-1.5-9B scores 5.20 on AlignBench, while the Chinese Alpaca-GPT4 version scores 4.12. This discrepancy makes it challenging for developers to create models that excel in downstream tasks and instruction following. Therefore, it is essential to improve data integrity. Currently, there are two mainstream methods for enhancing data integrity: data filtering and data augmentation. Due to the labor-intensive and time-consuming nature of performing these tasks manually, some of these efforts are now being undertaken by LLMs, owing to their high alignment with human preferences. However, we have found that performing data filtering or data augmentation with LLMs has limited effectiveness in improving data integrity. In this work, we propose FiNE (Filtering and Improving Noisy data Elaborately), a method that performs refined filtering and improvement of training data with LLMs. Using the data obtained through our method to train Yi-1.5-9B, the performance gap on AlignBench between our model and the open-source chat version is reduced from 1.08 to 0.35. Additionally, on HalluQA, our model surpasses the open-source chat version by 8.45.</abstract>
    <!-- Identifiers and location of the paper itself -->
    <identifier type="citekey">he-etal-2025-fine</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.437</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-long.437/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>8686</start>
        <end>8707</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T FiNE: Filtering and Improving Noisy Data Elaborately with Large Language Models
%A He, Junliang
%A Fan, Ziyue
%A Kuang, Shaohui
%A Xiaoqing, Li
%A Song, Kai
%A Zhou, Yaqian
%A Qiu, Xipeng
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F he-etal-2025-fine
%X Data is the lifeblood of large language models (LLMs). While the quantity of open-source data available for training LLMs is substantial, its integrity often falls short. For instance, the open-source chat version of Yi-1.5-9B scores 5.20 on AlignBench, while the Chinese Alpaca-GPT4 version scores 4.12. This discrepancy makes it challenging for developers to create models that excel in downstream tasks and instruction following. Therefore, it is essential to improve data integrity. Currently, there are two mainstream methods for enhancing data integrity: data filtering and data augmentation. Due to the labor-intensive and time-consuming nature of performing these tasks manually, some of these efforts are now being undertaken by LLMs, owing to their high alignment with human preferences. However, we have found that performing data filtering or data augmentation with LLMs has limited effectiveness in improving data integrity. In this work, we propose FiNE (Filtering and Improving Noisy data Elaborately), a method that performs refined filtering and improvement of training data with LLMs. Using the data obtained through our method to train Yi-1.5-9B, the performance gap on AlignBench between our model and the open-source chat version is reduced from 1.08 to 0.35. Additionally, on HalluQA, our model surpasses the open-source chat version by 8.45.
%R 10.18653/v1/2025.naacl-long.437
%U https://aclanthology.org/2025.naacl-long.437/
%U https://doi.org/10.18653/v1/2025.naacl-long.437
%P 8686-8707
Markdown (Informal)
[FiNE: Filtering and Improving Noisy Data Elaborately with Large Language Models](https://aclanthology.org/2025.naacl-long.437/) (He et al., NAACL 2025)
ACL
- Junliang He, Ziyue Fan, Shaohui Kuang, Li Xiaoqing, Kai Song, Yaqian Zhou, and Xipeng Qiu. 2025. FiNE: Filtering and Improving Noisy Data Elaborately with Large Language Models. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 8686–8707, Albuquerque, New Mexico. Association for Computational Linguistics.