@inproceedings{bai-etal-2025-efficient-pretraining,
title = "Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration",
author = "Bai, Tianyi and
Yang, Ling and
Wong, Zhen Hao and
Sun, Fupeng and
Zhuang, Xinlin and
Peng, Jiahui and
Zhang, Chi and
Wu, Lijun and
Qiu, Jiantao and
Zhang, Wentao and
Yuan, Binhang and
He, Conghui",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.466/",
doi = "10.18653/v1/2025.acl-long.466",
pages = "9465--9491",
ISBN = "979-8-89176-251-0",
abstract = "Efficient data selection is crucial to accelerate the pretraining of language models (LMs). While various methods have been proposed to enhance data efficiency, limited research has addressed the inherent conflicts between these approaches to achieve optimal data selection for LM pretraining. To tackle this problem, we propose a multi-actor collaborative data selection mechanism. Each data selection method independently prioritizes data based on its specific criterion and updates its prioritization rules using the current state of the model, functioning as an independent actor for data selection. Additionally, a console is designed to adjust the impacts of different actors at various stages and dynamically integrate information from all actors throughout the LM pretraining process. We conduct extensive empirical studies to evaluate our multi-actor framework. The experimental results demonstrate that our approach significantly improves data efficiency, accelerates convergence in LM pretraining, and achieves an average relative performance gain of up to 10.5{\%} across multiple language model benchmarks compared to state-of-the-art methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bai-etal-2025-efficient-pretraining">
<titleInfo>
<title>Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianyi</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ling</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhen</namePart>
<namePart type="given">Hao</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fupeng</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinlin</namePart>
<namePart type="family">Zhuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahui</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lijun</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiu</namePart>
<namePart type="family">Jiantao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wentao</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Binhang</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Conghui</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Efficient data selection is crucial to accelerate the pretraining of language models (LMs). While various methods have been proposed to enhance data efficiency, limited research has addressed the inherent conflicts between these approaches to achieve optimal data selection for LM pretraining. To tackle this problem, we propose a multi-actor collaborative data selection mechanism. Each data selection method independently prioritizes data based on its specific criterion and updates its prioritization rules using the current state of the model, functioning as an independent actor for data selection. Additionally, a console is designed to adjust the impacts of different actors at various stages and dynamically integrate information from all actors throughout the LM pretraining process. We conduct extensive empirical studies to evaluate our multi-actor framework. The experimental results demonstrate that our approach significantly improves data efficiency, accelerates convergence in LM pretraining, and achieves an average relative performance gain of up to 10.5% across multiple language model benchmarks compared to state-of-the-art methods.</abstract>
<identifier type="citekey">bai-etal-2025-efficient-pretraining</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.466</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.466/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>9465</start>
<end>9491</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration
%A Bai, Tianyi
%A Yang, Ling
%A Wong, Zhen Hao
%A Sun, Fupeng
%A Zhuang, Xinlin
%A Peng, Jiahui
%A Zhang, Chi
%A Wu, Lijun
%A Qiu, Jiantao
%A Zhang, Wentao
%A Yuan, Binhang
%A He, Conghui
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F bai-etal-2025-efficient-pretraining
%X Efficient data selection is crucial to accelerate the pretraining of language models (LMs). While various methods have been proposed to enhance data efficiency, limited research has addressed the inherent conflicts between these approaches to achieve optimal data selection for LM pretraining. To tackle this problem, we propose a multi-actor collaborative data selection mechanism. Each data selection method independently prioritizes data based on its specific criterion and updates its prioritization rules using the current state of the model, functioning as an independent actor for data selection. Additionally, a console is designed to adjust the impacts of different actors at various stages and dynamically integrate information from all actors throughout the LM pretraining process. We conduct extensive empirical studies to evaluate our multi-actor framework. The experimental results demonstrate that our approach significantly improves data efficiency, accelerates convergence in LM pretraining, and achieves an average relative performance gain of up to 10.5% across multiple language model benchmarks compared to state-of-the-art methods.
%R 10.18653/v1/2025.acl-long.466
%U https://aclanthology.org/2025.acl-long.466/
%U https://doi.org/10.18653/v1/2025.acl-long.466
%P 9465-9491
Markdown (Informal)
[Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration](https://aclanthology.org/2025.acl-long.466/) (Bai et al., ACL 2025)
ACL
Tianyi Bai, Ling Yang, Zhen Hao Wong, Fupeng Sun, Xinlin Zhuang, Jiahui Peng, Chi Zhang, Lijun Wu, Jiantao Qiu, Wentao Zhang, Binhang Yuan, and Conghui He. 2025. Efficient Pretraining Data Selection for Language Models via Multi-Actor Collaboration. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 9465–9491, Vienna, Austria. Association for Computational Linguistics.
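The abstract describes a two-level design (independent scoring actors plus a weighting console) that the citation records above do not spell out. As a rough illustration only, here is a minimal, self-contained Python sketch of that idea; the class names, the weight-update rule, and the toy scoring criteria are all assumptions made for exposition, not the authors' implementation (see the paper at the DOI above for the actual method).

```python
# Hypothetical sketch of the multi-actor data selection mechanism described
# in the abstract: each "actor" scores candidate examples by its own
# criterion, and a "console" combines the scores with weights that can be
# updated as pretraining progresses. All names here are illustrative.

from typing import Callable, Dict, List

Scorer = Callable[[str], float]  # maps a text example to a priority score


class Console:
    """Combines per-actor scores with adjustable weights."""

    def __init__(self, actors: Dict[str, Scorer]):
        self.actors = actors
        # Start with uniform influence across all actors.
        self.weights = {name: 1.0 / len(actors) for name in actors}

    def update_weights(self, feedback: Dict[str, float]) -> None:
        # Re-weight actors from some measure of recent usefulness (e.g.,
        # correlation with validation-loss improvement). The paper's actual
        # update rule may differ; this normalization is an assumption.
        total = sum(feedback.values()) or 1.0
        self.weights = {name: value / total for name, value in feedback.items()}

    def select(self, pool: List[str], k: int) -> List[str]:
        # Score each candidate as the weighted sum of actor scores and
        # keep the top-k for the next training stage.
        def combined(example: str) -> float:
            return sum(self.weights[name] * scorer(example)
                       for name, scorer in self.actors.items())

        return sorted(pool, key=combined, reverse=True)[:k]


# Toy actors standing in for real selection criteria.
actors = {
    "quality": lambda text: len(set(text.split())) / max(len(text.split()), 1),
    "length": lambda text: min(len(text.split()) / 100.0, 1.0),
}
console = Console(actors)
batch = console.select(
    ["short text", "a longer and more varied example of text"], k=1
)
print(batch)
```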