@inproceedings{wang-etal-2023-theory,
title = "A Theory of Unsupervised Speech Recognition",
author = "Wang, Liming and
Hasegawa-Johnson, Mark and
Yoo, Chang",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.acl-long.67",
doi = "10.18653/v1/2023.acl-long.67",
pages = "1192--1215",
abstract = "Unsupervised speech recognition (ASR-U) is the problem of learning automatic speech recognition (ASR) systems from \textit{unpaired} speech-only and text-only corpora. While various algorithms exist to solve this problem, a theoretical framework is missing to study their properties and address such issues as sensitivity to hyperparameters and training instability. In this paper, we proposed a general theoretical framework to study the properties of ASR-U systems based on random matrix theory and the theory of neural tangent kernels. Such a framework allows us to prove various learnability conditions and sample complexity bounds of ASR-U. Extensive ASR-U experiments on synthetic languages with three classes of transition graphs provide strong empirical evidence for our theory (code available at \url{https://github.com/cactuswiththoughts/UnsupASRTheory.git}).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2023-theory">
<titleInfo>
<title>A Theory of Unsupervised Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Liming</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Hasegawa-Johnson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chang</namePart>
<namePart type="family">Yoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Unsupervised speech recognition (ASR-U) is the problem of learning automatic speech recognition (ASR) systems from unpaired speech-only and text-only corpora. While various algorithms exist to solve this problem, a theoretical framework is missing to study their properties and address such issues as sensitivity to hyperparameters and training instability. In this paper, we proposed a general theoretical framework to study the properties of ASR-U systems based on random matrix theory and the theory of neural tangent kernels. Such a framework allows us to prove various learnability conditions and sample complexity bounds of ASR-U. Extensive ASR-U experiments on synthetic languages with three classes of transition graphs provide strong empirical evidence for our theory (code available at https://github.com/cactuswiththoughts/UnsupASRTheory.git).</abstract>
<identifier type="citekey">wang-etal-2023-theory</identifier>
<identifier type="doi">10.18653/v1/2023.acl-long.67</identifier>
<location>
<url>https://aclanthology.org/2023.acl-long.67</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>1192</start>
<end>1215</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Theory of Unsupervised Speech Recognition
%A Wang, Liming
%A Hasegawa-Johnson, Mark
%A Yoo, Chang
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F wang-etal-2023-theory
%X Unsupervised speech recognition (ASR-U) is the problem of learning automatic speech recognition (ASR) systems from unpaired speech-only and text-only corpora. While various algorithms exist to solve this problem, a theoretical framework is missing to study their properties and address such issues as sensitivity to hyperparameters and training instability. In this paper, we proposed a general theoretical framework to study the properties of ASR-U systems based on random matrix theory and the theory of neural tangent kernels. Such a framework allows us to prove various learnability conditions and sample complexity bounds of ASR-U. Extensive ASR-U experiments on synthetic languages with three classes of transition graphs provide strong empirical evidence for our theory (code available at https://github.com/cactuswiththoughts/UnsupASRTheory.git).
%R 10.18653/v1/2023.acl-long.67
%U https://aclanthology.org/2023.acl-long.67
%U https://doi.org/10.18653/v1/2023.acl-long.67
%P 1192-1215
Markdown (Informal)
[A Theory of Unsupervised Speech Recognition](https://aclanthology.org/2023.acl-long.67) (Wang et al., ACL 2023)
ACL
- Liming Wang, Mark Hasegawa-Johnson, and Chang Yoo. 2023. A Theory of Unsupervised Speech Recognition. In Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1192–1215, Toronto, Canada. Association for Computational Linguistics.