@inproceedings{du-etal-2025-disentangling,
title = "Disentangling the Roles of Representation and Selection in Data Pruning",
author = "Du, Yupei and
Song, Yingjin and
Wong, Hugh Mee and
Ignatev, Daniil and
Gatt, Albert and
Nguyen, Dong",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.821/",
doi = "10.18653/v1/2025.acl-long.821",
pages = "16791--16809",
ISBN = "979-8-89176-251-0",
abstract = "Data pruning{---}selecting small but impactful subsets{---}offers a promising way to efficiently scale NLP model training. However, existing methods often involve many different design choices, which have not been systematically studied. This limits future developments. In this work, we decompose data pruning into two key components: data representation and selection algorithm, and systematically analyze their influence on selected instances. Our theoretical and empirical results highlight the crucial role of representations: better representations, e.g., training gradients, generally lead to better selected instances, regardless of the chosen selection algorithm. Furthermore, different selection algorithms excel in different settings, and none consistently outperform the others. Moreover, the selection algorithms do not always align with their intended objectives: for example, algorithms designed for the same objective can select drastically different instances, highlighting the need for careful evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="du-etal-2025-disentangling">
<titleInfo>
<title>Disentangling the Roles of Representation and Selection in Data Pruning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yupei</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingjin</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hugh</namePart>
<namePart type="given">Mee</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniil</namePart>
<namePart type="family">Ignatev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Albert</namePart>
<namePart type="family">Gatt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dong</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Data pruning—selecting small but impactful subsets—offers a promising way to efficiently scale NLP model training. However, existing methods often involve many different design choices, which have not been systematically studied. This limits future developments. In this work, we decompose data pruning into two key components: data representation and selection algorithm, and systematically analyze their influence on selected instances. Our theoretical and empirical results highlight the crucial role of representations: better representations, e.g., training gradients, generally lead to better selected instances, regardless of the chosen selection algorithm. Furthermore, different selection algorithms excel in different settings, and none consistently outperform the others. Moreover, the selection algorithms do not always align with their intended objectives: for example, algorithms designed for the same objective can select drastically different instances, highlighting the need for careful evaluation.</abstract>
<identifier type="citekey">du-etal-2025-disentangling</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.821</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.821/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>16791</start>
<end>16809</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Disentangling the Roles of Representation and Selection in Data Pruning
%A Du, Yupei
%A Song, Yingjin
%A Wong, Hugh Mee
%A Ignatev, Daniil
%A Gatt, Albert
%A Nguyen, Dong
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F du-etal-2025-disentangling
%X Data pruning—selecting small but impactful subsets—offers a promising way to efficiently scale NLP model training. However, existing methods often involve many different design choices, which have not been systematically studied. This limits future developments. In this work, we decompose data pruning into two key components: data representation and selection algorithm, and systematically analyze their influence on selected instances. Our theoretical and empirical results highlight the crucial role of representations: better representations, e.g., training gradients, generally lead to better selected instances, regardless of the chosen selection algorithm. Furthermore, different selection algorithms excel in different settings, and none consistently outperform the others. Moreover, the selection algorithms do not always align with their intended objectives: for example, algorithms designed for the same objective can select drastically different instances, highlighting the need for careful evaluation.
%R 10.18653/v1/2025.acl-long.821
%U https://aclanthology.org/2025.acl-long.821/
%U https://doi.org/10.18653/v1/2025.acl-long.821
%P 16791-16809
Markdown (Informal)
[Disentangling the Roles of Representation and Selection in Data Pruning](https://aclanthology.org/2025.acl-long.821/) (Du et al., ACL 2025)
ACL
- Yupei Du, Yingjin Song, Hugh Mee Wong, Daniil Ignatev, Albert Gatt, and Dong Nguyen. 2025. Disentangling the Roles of Representation and Selection in Data Pruning. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 16791–16809, Vienna, Austria. Association for Computational Linguistics.