@inproceedings{prabhu-etal-2023-accented,
title = "Accented Speech Recognition With Accent-specific Codebooks",
author = "Prabhu, Darshan and
Jyothi, Preethi and
Ganapathy, Sriram and
Unni, Vinit",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.444",
doi = "10.18653/v1/2023.emnlp-main.444",
pages = "7175--7188",
abstract = "Speech accents pose a significant challenge to state-of-the-art automatic speech recognition (ASR) systems. Degradation in performance across underrepresented accents is a severe deterrent to the inclusive adoption of ASR. In this work, we propose a novel accent adaptation approach for end-to-end ASR systems using cross-attention with a trainable set of codebooks. These learnable codebooks capture accent-specific information and are integrated within the ASR encoder layers. The model is trained on accented English speech, while the test data also contained accents which were not seen during training. On the Mozilla Common Voice multi-accented dataset, we show that our proposed approach yields significant performance gains not only on the seen English accents (up to 37{\%} relative improvement in word error rate) but also on the unseen accents (up to 5{\%} relative improvement in WER). Further, we illustrate benefits for a zero-shot transfer setup on the L2Artic dataset. We also compare the performance with other approaches based on accent adversarial training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prabhu-etal-2023-accented">
<titleInfo>
<title>Accented Speech Recognition With Accent-specific Codebooks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Darshan</namePart>
<namePart type="family">Prabhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preethi</namePart>
<namePart type="family">Jyothi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sriram</namePart>
<namePart type="family">Ganapathy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vinit</namePart>
<namePart type="family">Unni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Speech accents pose a significant challenge to state-of-the-art automatic speech recognition (ASR) systems. Degradation in performance across underrepresented accents is a severe deterrent to the inclusive adoption of ASR. In this work, we propose a novel accent adaptation approach for end-to-end ASR systems using cross-attention with a trainable set of codebooks. These learnable codebooks capture accent-specific information and are integrated within the ASR encoder layers. The model is trained on accented English speech, while the test data also contains accents that were not seen during training. On the Mozilla Common Voice multi-accented dataset, we show that our proposed approach yields significant performance gains not only on the seen English accents (up to 37% relative improvement in word error rate) but also on the unseen accents (up to 5% relative improvement in WER). Further, we illustrate benefits for a zero-shot transfer setup on the L2-ARCTIC dataset. We also compare the performance with other approaches based on accent adversarial training.</abstract>
<identifier type="citekey">prabhu-etal-2023-accented</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.444</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.444</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>7175</start>
<end>7188</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Accented Speech Recognition With Accent-specific Codebooks
%A Prabhu, Darshan
%A Jyothi, Preethi
%A Ganapathy, Sriram
%A Unni, Vinit
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F prabhu-etal-2023-accented
%X Speech accents pose a significant challenge to state-of-the-art automatic speech recognition (ASR) systems. Degradation in performance across underrepresented accents is a severe deterrent to the inclusive adoption of ASR. In this work, we propose a novel accent adaptation approach for end-to-end ASR systems using cross-attention with a trainable set of codebooks. These learnable codebooks capture accent-specific information and are integrated within the ASR encoder layers. The model is trained on accented English speech, while the test data also contains accents that were not seen during training. On the Mozilla Common Voice multi-accented dataset, we show that our proposed approach yields significant performance gains not only on the seen English accents (up to 37% relative improvement in word error rate) but also on the unseen accents (up to 5% relative improvement in WER). Further, we illustrate benefits for a zero-shot transfer setup on the L2-ARCTIC dataset. We also compare the performance with other approaches based on accent adversarial training.
%R 10.18653/v1/2023.emnlp-main.444
%U https://aclanthology.org/2023.emnlp-main.444
%U https://doi.org/10.18653/v1/2023.emnlp-main.444
%P 7175-7188
Markdown (Informal)
[Accented Speech Recognition With Accent-specific Codebooks](https://aclanthology.org/2023.emnlp-main.444) (Prabhu et al., EMNLP 2023)

ACL
Darshan Prabhu, Preethi Jyothi, Sriram Ganapathy, and Vinit Unni. 2023. Accented Speech Recognition With Accent-specific Codebooks. In *Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing*, pages 7175–7188, Singapore. Association for Computational Linguistics.
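
The abstract describes the method only at a high level: a trainable set of codebook vectors that ASR encoder layers attend to via cross-attention. For a concrete picture of that mechanism, below is a minimal PyTorch sketch. It is an illustration under stated assumptions, not the authors' implementation: the module name `AccentCodebookAttention`, the dimensions, the residual-plus-LayerNorm placement, and all hyperparameters are my choices, and the sketch uses a single shared codebook whereas the paper's title indicates accent-specific codebooks. See the paper itself for the actual architecture and training setup.

```python
# Hedged sketch of cross-attention over a learnable codebook, in the spirit
# of the mechanism the abstract describes. Not the authors' code; every name
# and hyperparameter here is an assumption chosen for illustration.
import torch
import torch.nn as nn


class AccentCodebookAttention(nn.Module):
    """Cross-attends encoder frames (queries) to a small trainable codebook
    (keys/values) intended to absorb accent-specific information."""

    def __init__(self, d_model: int = 256, num_codes: int = 64, num_heads: int = 4):
        super().__init__()
        # Learnable codebook: num_codes vectors of size d_model.
        self.codebook = nn.Parameter(torch.randn(num_codes, d_model) * 0.02)
        self.cross_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, frames, d_model) hidden states from an ASR encoder layer.
        codes = self.codebook.unsqueeze(0).expand(x.size(0), -1, -1)
        attended, _ = self.cross_attn(query=x, key=codes, value=codes)
        # Residual connection + LayerNorm so the block can be inserted into an
        # existing encoder layer without disrupting its learned features.
        return self.norm(x + attended)


if __name__ == "__main__":
    block = AccentCodebookAttention()
    frames = torch.randn(2, 100, 256)  # dummy batch: 2 utterances, 100 frames
    print(block(frames).shape)         # torch.Size([2, 100, 256])
```

Because the frames act as queries and the codebook as keys/values, the output keeps the input's time dimension, which is what lets such a block sit inside encoder layers as the abstract describes; how the paper selects or combines codebooks per accent is not shown here.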