@inproceedings{van-esch-2024-now,
title = "Now You See Me, Now You Don{'}t: {`}Poverty of the Stimulus{'} Problems and Arbitrary Correspondences in End-to-End Speech Models",
author = "van Esch, Daan",
editor = "Gorman, Kyle and
Prud'hommeaux, Emily and
Roark, Brian and
Sproat, Richard",
booktitle = "Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.cawl-1.6",
pages = "43--52",
abstract = "End-to-end models for speech recognition and speech synthesis have many benefits, but we argue they also face a unique set of challenges not encountered in conventional multi-stage hybrid systems, which relied on the explicit injection of linguistic knowledge through resources such as phonemic dictionaries and verbalization grammars. These challenges include handling words with unusual grapheme-to-phoneme correspondences, converting between written forms like {`}12{'} and spoken forms such as {`}twelve{'}, and contextual disambiguation of homophones or homographs. We describe the mitigation strategies that have been used for these problems in end-to-end systems, either implicitly or explicitly, and call out that the most commonly used mitigation techniques are likely incompatible with newly emerging approaches that use minimal amounts of supervised audio training data. We review best-of-both-world approaches that allow the use of end-to-end models combined with traditional linguistic resources, which we show are increasingly straightforward to create at scale, and close with an optimistic outlook for bringing speech technologies to many more languages by combining these strands of research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="van-esch-2024-now">
<titleInfo>
<title>Now You See Me, Now You Don’t: ‘Poverty of the Stimulus’ Problems and Arbitrary Correspondences in End-to-End Speech Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daan</namePart>
<namePart type="family">van Esch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Gorman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Prud’hommeaux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Sproat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>End-to-end models for speech recognition and speech synthesis have many benefits, but we argue they also face a unique set of challenges not encountered in conventional multi-stage hybrid systems, which relied on the explicit injection of linguistic knowledge through resources such as phonemic dictionaries and verbalization grammars. These challenges include handling words with unusual grapheme-to-phoneme correspondences, converting between written forms like ‘12’ and spoken forms such as ‘twelve’, and contextual disambiguation of homophones or homographs. We describe the mitigation strategies that have been used for these problems in end-to-end systems, either implicitly or explicitly, and call out that the most commonly used mitigation techniques are likely incompatible with newly emerging approaches that use minimal amounts of supervised audio training data. We review best-of-both-world approaches that allow the use of end-to-end models combined with traditional linguistic resources, which we show are increasingly straightforward to create at scale, and close with an optimistic outlook for bringing speech technologies to many more languages by combining these strands of research.</abstract>
<identifier type="citekey">van-esch-2024-now</identifier>
<location>
<url>https://aclanthology.org/2024.cawl-1.6</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>43</start>
<end>52</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Now You See Me, Now You Don’t: ‘Poverty of the Stimulus’ Problems and Arbitrary Correspondences in End-to-End Speech Models
%A van Esch, Daan
%Y Gorman, Kyle
%Y Prud’hommeaux, Emily
%Y Roark, Brian
%Y Sproat, Richard
%S Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F van-esch-2024-now
%X End-to-end models for speech recognition and speech synthesis have many benefits, but we argue they also face a unique set of challenges not encountered in conventional multi-stage hybrid systems, which relied on the explicit injection of linguistic knowledge through resources such as phonemic dictionaries and verbalization grammars. These challenges include handling words with unusual grapheme-to-phoneme correspondences, converting between written forms like ‘12’ and spoken forms such as ‘twelve’, and contextual disambiguation of homophones or homographs. We describe the mitigation strategies that have been used for these problems in end-to-end systems, either implicitly or explicitly, and call out that the most commonly used mitigation techniques are likely incompatible with newly emerging approaches that use minimal amounts of supervised audio training data. We review best-of-both-world approaches that allow the use of end-to-end models combined with traditional linguistic resources, which we show are increasingly straightforward to create at scale, and close with an optimistic outlook for bringing speech technologies to many more languages by combining these strands of research.
%U https://aclanthology.org/2024.cawl-1.6
%P 43-52
Markdown (Informal)
[Now You See Me, Now You Don’t: ‘Poverty of the Stimulus’ Problems and Arbitrary Correspondences in End-to-End Speech Models](https://aclanthology.org/2024.cawl-1.6) (van Esch, CAWL-WS 2024)
ACL