@inproceedings{liangyu-etal-2025-fire,
title = "{FIRE}: Flexible Integration of Data Quality Ratings for Effective Pretraining",
author = "Liangyu, Xu and
Zhang, Xuemiao and
Duan, Feiyu and
Wang, Sirui and
Weng, Rongxiang and
Wang, Jingang and
Cai, Xunliang",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.735/",
doi = "10.18653/v1/2025.emnlp-main.735",
pages = "14532--14552",
ISBN = "979-8-89176-332-6",
abstract = "Selecting high-quality data can improve the pretraining efficiency of large language models (LLMs). Existing methods generally rely on heuristic techniques or single quality signals, limiting their ability to evaluate data quality comprehensively. In this work, we propose FIRE, a flexible and scalable framework for integrating multiple data quality raters, which allows for a comprehensive assessment of data quality across various dimensions. FIRE aligns multiple quality signals into a unified space, and integrates diverse data quality raters to provide a comprehensive quality signal for each data point. Further, we introduce a progressive data selection scheme based on FIRE that iteratively refines the selection of high-quality data points. Extensive experiments show that FIRE outperforms other data selection methods and significantly boosts pretrained model performance across a wide range of downstream tasks, while requiring less than 37.5{\%} tokens needed by the $\textit{Random}$ baseline to reach the target performance."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liangyu-etal-2025-fire">
<titleInfo>
<title>FIRE: Flexible Integration of Data Quality Ratings for Effective Pretraining</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Liangyu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuemiao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Feiyu</namePart>
<namePart type="family">Duan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sirui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rongxiang</namePart>
<namePart type="family">Weng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jingang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xunliang</namePart>
<namePart type="family">Cai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Selecting high-quality data can improve the pretraining efficiency of large language models (LLMs). Existing methods generally rely on heuristic techniques or single quality signals, limiting their ability to evaluate data quality comprehensively. In this work, we propose FIRE, a flexible and scalable framework for integrating multiple data quality raters, which allows for a comprehensive assessment of data quality across various dimensions. FIRE aligns multiple quality signals into a unified space, and integrates diverse data quality raters to provide a comprehensive quality signal for each data point. Further, we introduce a progressive data selection scheme based on FIRE that iteratively refines the selection of high-quality data points. Extensive experiments show that FIRE outperforms other data selection methods and significantly boosts pretrained model performance across a wide range of downstream tasks, while requiring less than 37.5% tokens needed by the Random baseline to reach the target performance.</abstract>
<identifier type="citekey">liangyu-etal-2025-fire</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.735</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.735/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>14532</start>
<end>14552</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FIRE: Flexible Integration of Data Quality Ratings for Effective Pretraining
%A Liangyu, Xu
%A Zhang, Xuemiao
%A Duan, Feiyu
%A Wang, Sirui
%A Weng, Rongxiang
%A Wang, Jingang
%A Cai, Xunliang
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F liangyu-etal-2025-fire
%X Selecting high-quality data can improve the pretraining efficiency of large language models (LLMs). Existing methods generally rely on heuristic techniques or single quality signals, limiting their ability to evaluate data quality comprehensively. In this work, we propose FIRE, a flexible and scalable framework for integrating multiple data quality raters, which allows for a comprehensive assessment of data quality across various dimensions. FIRE aligns multiple quality signals into a unified space, and integrates diverse data quality raters to provide a comprehensive quality signal for each data point. Further, we introduce a progressive data selection scheme based on FIRE that iteratively refines the selection of high-quality data points. Extensive experiments show that FIRE outperforms other data selection methods and significantly boosts pretrained model performance across a wide range of downstream tasks, while requiring less than 37.5% tokens needed by the Random baseline to reach the target performance.
%R 10.18653/v1/2025.emnlp-main.735
%U https://aclanthology.org/2025.emnlp-main.735/
%U https://doi.org/10.18653/v1/2025.emnlp-main.735
%P 14532-14552
Markdown (Informal)
[FIRE: Flexible Integration of Data Quality Ratings for Effective Pretraining](https://aclanthology.org/2025.emnlp-main.735/) (Liangyu et al., EMNLP 2025)
ACL