@inproceedings{chimoto-etal-2024-critical,
title = "Critical Learning Periods: Leveraging Early Training Dynamics for Efficient Data Pruning",
author = "Chimoto, Everlyn and
Gala, Jay and
Ahia, Orevaoghene and
Kreutzer, Julia and
Bassett, Bruce and
Hooker, Sara",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.560",
doi = "10.18653/v1/2024.findings-acl.560",
pages = "9407--9426",
abstract = "Neural Machine Translation models are extremely data and compute-hungry. However, not all datapoints contribute equally to model training and generalization. Data pruning to remove the low-value data points has the benefit of drastically reducing the compute budget without significantdrop in model performance. In this paper, we propose a new data pruning technique: CheckpointsAcross Time (CAT ), that leverages early model training dynamics to identify the most relevantdata points for model performance. We benchmark CAT against several data pruning techniquesincluding COMET-QE, LASER and LaBSE. We find that CAT outperforms the benchmarks onIndo-European languages on multiple test sets. When applied to English-German, English-Frenchand English-Swahili translation tasks, CAT achieves comparable performance to using the fulldataset, while pruning up to 50{\%} of training data. We inspect the data points that CAT selectsand find that it tends to favour longer sentences and sentences with unique or rare words.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chimoto-etal-2024-critical">
<titleInfo>
<title>Critical Learning Periods: Leveraging Early Training Dynamics for Efficient Data Pruning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Everlyn</namePart>
<namePart type="family">Chimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jay</namePart>
<namePart type="family">Gala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Orevaoghene</namePart>
<namePart type="family">Ahia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Kreutzer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bruce</namePart>
<namePart type="family">Bassett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Neural Machine Translation models are extremely data and compute-hungry. However, not all data points contribute equally to model training and generalization. Data pruning to remove the low-value data points has the benefit of drastically reducing the compute budget without significant drop in model performance. In this paper, we propose a new data pruning technique: Checkpoints Across Time (CAT), that leverages early model training dynamics to identify the most relevant data points for model performance. We benchmark CAT against several data pruning techniques including COMET-QE, LASER and LaBSE. We find that CAT outperforms the benchmarks on Indo-European languages on multiple test sets. When applied to English-German, English-French and English-Swahili translation tasks, CAT achieves comparable performance to using the full dataset, while pruning up to 50% of training data. We inspect the data points that CAT selects and find that it tends to favour longer sentences and sentences with unique or rare words.</abstract>
<identifier type="citekey">chimoto-etal-2024-critical</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.560</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.560</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>9407</start>
<end>9426</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Critical Learning Periods: Leveraging Early Training Dynamics for Efficient Data Pruning
%A Chimoto, Everlyn
%A Gala, Jay
%A Ahia, Orevaoghene
%A Kreutzer, Julia
%A Bassett, Bruce
%A Hooker, Sara
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F chimoto-etal-2024-critical
%X Neural Machine Translation models are extremely data and compute-hungry. However, not all data points contribute equally to model training and generalization. Data pruning to remove the low-value data points has the benefit of drastically reducing the compute budget without significant drop in model performance. In this paper, we propose a new data pruning technique: Checkpoints Across Time (CAT), that leverages early model training dynamics to identify the most relevant data points for model performance. We benchmark CAT against several data pruning techniques including COMET-QE, LASER and LaBSE. We find that CAT outperforms the benchmarks on Indo-European languages on multiple test sets. When applied to English-German, English-French and English-Swahili translation tasks, CAT achieves comparable performance to using the full dataset, while pruning up to 50% of training data. We inspect the data points that CAT selects and find that it tends to favour longer sentences and sentences with unique or rare words.
%R 10.18653/v1/2024.findings-acl.560
%U https://aclanthology.org/2024.findings-acl.560
%U https://doi.org/10.18653/v1/2024.findings-acl.560
%P 9407-9426
Markdown (Informal)
[Critical Learning Periods: Leveraging Early Training Dynamics for Efficient Data Pruning](https://aclanthology.org/2024.findings-acl.560) (Chimoto et al., Findings 2024)
ACL
Everlyn Chimoto, Jay Gala, Orevaoghene Ahia, Julia Kreutzer, Bruce Bassett, and Sara Hooker. 2024. Critical Learning Periods: Leveraging Early Training Dynamics for Efficient Data Pruning. In Findings of the Association for Computational Linguistics: ACL 2024, pages 9407–9426, Bangkok, Thailand. Association for Computational Linguistics.