@inproceedings{mittal-etal-2022-partitioned,
    title = "Partitioned Gradient Matching-based Data Subset Selection for Compute-Efficient Robust {ASR} Training",
    author = "Mittal, Ashish and
      Sivasubramanian, Durga and
      Iyer, Rishabh and
      Jyothi, Preethi and
      Ramakrishnan, Ganesh",
    editor = "Goldberg, Yoav and
      Kozareva, Zornitsa and
      Zhang, Yue",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
    month = dec,
    year = "2022",
    address = "Abu Dhabi, United Arab Emirates",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.findings-emnlp.443",
    doi = "10.18653/v1/2022.findings-emnlp.443",
    pages = "5999--6010",
    abstract = "Training state-of-the-art ASR systems such as RNN-T often has a high associated financial and environmental cost. Training with a subset of the training data could mitigate this problem if the selected subset achieves on-par performance with training on the entire dataset. Although there are many data subset selection (DSS) algorithms, direct application to the RNN-T is difficult, especially for DSS algorithms that are adaptive and use learning dynamics such as gradients, as RNN-T models tend to have gradients with a significantly larger memory footprint. In this paper, we propose Partitioned Gradient Matching (PGM), a novel distributable DSS algorithm suitable for massive datasets like those used to train RNN-T. Through extensive experiments on Librispeech 100H and Librispeech 960H, we show that PGM achieves between a 3x and 6x speedup with only a very small accuracy degradation (under 1{\%} absolute WER difference). In addition, we demonstrate similar results for PGM even in settings where the training data is corrupted with noise.",
}