@inproceedings{liu-etal-2024-important,
title = "How Important is a Language Model for Low-resource {ASR}?",
author = "Liu, Zoey and
Venkateswaran, Nitin and
Le Ferrand, Eric and
Prud{'}hommeaux, Emily",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.13",
doi = "10.18653/v1/2024.findings-acl.13",
pages = "206--213",
abstract = "N-gram language models (LMs) are the innovation that first made large-vocabulary continuous automatic speech recognition (ASR) viable. With neural end-to-end ASR architectures, however, LMs have become an afterthought. While the effect on accuracy may be negligible for English and Mandarin, jettisoning the LM might not make sense for the world{'}s remaining 6000+ languages. In this paper, we investigate the role of the LM in low-resource ASR. First we ask: does using an n-gram LM in decoding in neural architectures help ASR performance? While it may seem obvious that it should, its absence in most implementations suggests otherwise. Second, we ask: when an n-gram LM is used in ASR, is there a relationship between the size of the LM and ASR accuracy? We have discovered that gut feelings on this question vary considerably, but there is little empirical work to support any particular claim. We explore these questions {``}in the wild{''} using a deliberately diverse set of 9 very small ASR corpora. The results show that: (1) decoding with an n-gram LM, regardless of its size, leads to lower word error rates; and (2) increasing the size of the LM appears to yield improvements only when the audio corpus itself is already relatively large. This suggests that collecting additional LM training text may benefit widely-spoken languages which typically have larger audio corpora. In contrast, for endangered languages where data of any kind will always be limited, efforts may be better spent collecting additional transcribed audio.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2024-important">
<titleInfo>
<title>How Important is a Language Model for Low-resource ASR?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zoey</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nitin</namePart>
<namePart type="family">Venkateswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Le Ferrand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Prud’hommeaux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>N-gram language models (LMs) are the innovation that first made large-vocabulary continuous automatic speech recognition (ASR) viable. With neural end-to-end ASR architectures, however, LMs have become an afterthought. While the effect on accuracy may be negligible for English and Mandarin, jettisoning the LM might not make sense for the world’s remaining 6000+ languages. In this paper, we investigate the role of the LM in low-resource ASR. First we ask: does using an n-gram LM in decoding in neural architectures help ASR performance? While it may seem obvious that it should, its absence in most implementations suggests otherwise. Second, we ask: when an n-gram LM is used in ASR, is there a relationship between the size of the LM and ASR accuracy? We have discovered that gut feelings on this question vary considerably, but there is little empirical work to support any particular claim. We explore these questions “in the wild” using a deliberately diverse set of 9 very small ASR corpora. The results show that: (1) decoding with an n-gram LM, regardless of its size, leads to lower word error rates; and (2) increasing the size of the LM appears to yield improvements only when the audio corpus itself is already relatively large. This suggests that collecting additional LM training text may benefit widely-spoken languages which typically have larger audio corpora. In contrast, for endangered languages where data of any kind will always be limited, efforts may be better spent collecting additional transcribed audio.</abstract>
<identifier type="citekey">liu-etal-2024-important</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.13</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.13</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>206</start>
<end>213</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How Important is a Language Model for Low-resource ASR?
%A Liu, Zoey
%A Venkateswaran, Nitin
%A Le Ferrand, Eric
%A Prud’hommeaux, Emily
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F liu-etal-2024-important
%X N-gram language models (LMs) are the innovation that first made large-vocabulary continuous automatic speech recognition (ASR) viable. With neural end-to-end ASR architectures, however, LMs have become an afterthought. While the effect on accuracy may be negligible for English and Mandarin, jettisoning the LM might not make sense for the world’s remaining 6000+ languages. In this paper, we investigate the role of the LM in low-resource ASR. First we ask: does using an n-gram LM in decoding in neural architectures help ASR performance? While it may seem obvious that it should, its absence in most implementations suggests otherwise. Second, we ask: when an n-gram LM is used in ASR, is there a relationship between the size of the LM and ASR accuracy? We have discovered that gut feelings on this question vary considerably, but there is little empirical work to support any particular claim. We explore these questions “in the wild” using a deliberately diverse set of 9 very small ASR corpora. The results show that: (1) decoding with an n-gram LM, regardless of its size, leads to lower word error rates; and (2) increasing the size of the LM appears to yield improvements only when the audio corpus itself is already relatively large. This suggests that collecting additional LM training text may benefit widely-spoken languages which typically have larger audio corpora. In contrast, for endangered languages where data of any kind will always be limited, efforts may be better spent collecting additional transcribed audio.
%R 10.18653/v1/2024.findings-acl.13
%U https://aclanthology.org/2024.findings-acl.13
%U https://doi.org/10.18653/v1/2024.findings-acl.13
%P 206-213
Markdown (Informal)
[How Important is a Language Model for Low-resource ASR?](https://aclanthology.org/2024.findings-acl.13) (Liu et al., Findings 2024)
ACL
- Zoey Liu, Nitin Venkateswaran, Eric Le Ferrand, and Emily Prud’hommeaux. 2024. How Important is a Language Model for Low-resource ASR?. In Findings of the Association for Computational Linguistics: ACL 2024, pages 206–213, Bangkok, Thailand. Association for Computational Linguistics.