@inproceedings{kothawade-etal-2023-ditto,
title = "{DITTO}: Data-efficient and Fair Targeted Subset Selection for {ASR} Accent Adaptation",
author = "Kothawade, Suraj and
Mekala, Anmol and
Hetha Havya, D.Chandra Sekhara and
Kothyari, Mayank and
Iyer, Rishabh and
Ramakrishnan, Ganesh and
Jyothi, Preethi",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.319",
doi = "10.18653/v1/2023.acl-long.319",
pages = "5810--5822",
abstract = "State-of-the-art Automatic Speech Recognition (ASR) systems are known to exhibit disparate performance on varying speech accents. To improve performance on a specific target accent, a commonly adopted solution is to finetune the ASR model using accent-specific labeled speech. However, acquiring large amounts of labeled speech for specific target accents is challenging. Choosing an informative subset of speech samples that are most representative of the target accents becomes important for effective ASR finetuning. To address this problem, we propose DITTO (Data-efficient and faIr Targeted subseT selectiOn that uses Submodular Mutual Information (SMI) functions as acquisition functions to find the most informative set of utterances matching a target accent within a fixed budget. An important feature of DITTO is that it supports fair targeting for multiple accents, i.e. it can automatically select representative data points from multiple accents when the ASR model needs to perform well on more than one accent. We show that compared to other speech selection methods, DITTO is 3-5 times as label-efficient for its improvements on the Indic-TTS and L2 datasets.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kothawade-etal-2023-ditto">
<titleInfo>
<title>DITTO: Data-efficient and Fair Targeted Subset Selection for ASR Accent Adaptation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Suraj</namePart>
<namePart type="family">Kothawade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anmol</namePart>
<namePart type="family">Mekala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">D.Chandra</namePart>
<namePart type="given">Sekhara</namePart>
<namePart type="family">Hetha Havya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mayank</namePart>
<namePart type="family">Kothyari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rishabh</namePart>
<namePart type="family">Iyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ganesh</namePart>
<namePart type="family">Ramakrishnan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preethi</namePart>
<namePart type="family">Jyothi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>State-of-the-art Automatic Speech Recognition (ASR) systems are known to exhibit disparate performance on varying speech accents. To improve performance on a specific target accent, a commonly adopted solution is to finetune the ASR model using accent-specific labeled speech. However, acquiring large amounts of labeled speech for specific target accents is challenging. Choosing an informative subset of speech samples that are most representative of the target accents becomes important for effective ASR finetuning. To address this problem, we propose DITTO (Data-efficient and faIr Targeted subseT selectiOn that uses Submodular Mutual Information (SMI) functions as acquisition functions to find the most informative set of utterances matching a target accent within a fixed budget. An important feature of DITTO is that it supports fair targeting for multiple accents, i.e. it can automatically select representative data points from multiple accents when the ASR model needs to perform well on more than one accent. We show that compared to other speech selection methods, DITTO is 3-5 times as label-efficient for its improvements on the Indic-TTS and L2 datasets.</abstract>
<identifier type="citekey">kothawade-etal-2023-ditto</identifier>
<identifier type="doi">10.18653/v1/2023.acl-long.319</identifier>
<location>
<url>https://aclanthology.org/2023.acl-long.319</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>5810</start>
<end>5822</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DITTO: Data-efficient and Fair Targeted Subset Selection for ASR Accent Adaptation
%A Kothawade, Suraj
%A Mekala, Anmol
%A Hetha Havya, D.Chandra Sekhara
%A Kothyari, Mayank
%A Iyer, Rishabh
%A Ramakrishnan, Ganesh
%A Jyothi, Preethi
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F kothawade-etal-2023-ditto
%X State-of-the-art Automatic Speech Recognition (ASR) systems are known to exhibit disparate performance on varying speech accents. To improve performance on a specific target accent, a commonly adopted solution is to finetune the ASR model using accent-specific labeled speech. However, acquiring large amounts of labeled speech for specific target accents is challenging. Choosing an informative subset of speech samples that are most representative of the target accents becomes important for effective ASR finetuning. To address this problem, we propose DITTO (Data-efficient and faIr Targeted subseT selectiOn that uses Submodular Mutual Information (SMI) functions as acquisition functions to find the most informative set of utterances matching a target accent within a fixed budget. An important feature of DITTO is that it supports fair targeting for multiple accents, i.e. it can automatically select representative data points from multiple accents when the ASR model needs to perform well on more than one accent. We show that compared to other speech selection methods, DITTO is 3-5 times as label-efficient for its improvements on the Indic-TTS and L2 datasets.
%R 10.18653/v1/2023.acl-long.319
%U https://aclanthology.org/2023.acl-long.319
%U https://doi.org/10.18653/v1/2023.acl-long.319
%P 5810-5822
Markdown (Informal)
[DITTO: Data-efficient and Fair Targeted Subset Selection for ASR Accent Adaptation](https://aclanthology.org/2023.acl-long.319) (Kothawade et al., ACL 2023)
ACL
- Suraj Kothawade, Anmol Mekala, D.Chandra Sekhara Hetha Havya, Mayank Kothyari, Rishabh Iyer, Ganesh Ramakrishnan, and Preethi Jyothi. 2023. DITTO: Data-efficient and Fair Targeted Subset Selection for ASR Accent Adaptation. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 5810–5822, Toronto, Canada. Association for Computational Linguistics.