@inproceedings{gong-etal-2022-train,
title = "Can We Train a Language Model Inside an End-to-End {ASR} Model? - Investigating Effective Implicit Language Modeling",
author = "Gong, Zhuo and
Saito, Daisuke and
Li, Sheng and
Kawai, Hisashi and
Minematsu, Nobuaki",
editor = "Wu, Xianchao and
Ruan, Peiying and
Li, Sheng and
Dong, Yi",
booktitle = "Proceedings of the Second Workshop on When Creative AI Meets Conversational AI",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.cai-1.6",
pages = "42--47",
abstract = "Language models (LM) have played crucial roles in automatic speech recognition (ASR) to enhance end-to-end (E2E) ASR systems{'} performance. There are two categories of approaches: finding better ways to integrate LMs into ASR systems and adapting on LMs to the task domain. This article will start with a reflection of interpolation-based integration methods of E2E ASR{'}s scores and LM{'}s scores. Then we will focus on LM augmentation approaches based on the noisy channel model, which is intrigued by insights obtained from the above reflection. The experiments show that we can enhance an ASR E2E model based on encoder-decoder architecture by pre-training the decoder with text data. This implies the decoder of an E2E model can be treated as an LM and reveals the possibility of enhancing the E2E model without an external LM. Based on those ideas, we proposed the implicit language model canceling method and then did more discussion about the decoder part of an E2E ASR model. The experimental results on the TED-LIUM2 dataset show that our approach achieves a 3.4{\%} relative WER reduction compared with the baseline system, and more analytic experiments provide concrete experimental supports for our assumption.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gong-etal-2022-train">
<titleInfo>
<title>Can We Train a Language Model Inside an End-to-End ASR Model? - Investigating Effective Implicit Language Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhuo</namePart>
<namePart type="family">Gong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daisuke</namePart>
<namePart type="family">Saito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hisashi</namePart>
<namePart type="family">Kawai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nobuaki</namePart>
<namePart type="family">Minematsu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on When Creative AI Meets Conversational AI</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xianchao</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peiying</namePart>
<namePart type="family">Ruan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language models (LMs) have played crucial roles in automatic speech recognition (ASR) to enhance end-to-end (E2E) ASR systems’ performance. There are two categories of approaches: finding better ways to integrate LMs into ASR systems, and adapting LMs to the task domain. This article starts with a reflection on interpolation-based methods for combining E2E ASR scores and LM scores. We then focus on LM augmentation approaches based on the noisy channel model, motivated by insights obtained from that reflection. The experiments show that we can enhance an E2E ASR model based on the encoder-decoder architecture by pre-training the decoder with text data. This implies that the decoder of an E2E model can be treated as an LM and reveals the possibility of enhancing the E2E model without an external LM. Based on these ideas, we propose the implicit language model canceling method and further discuss the decoder of an E2E ASR model. Experimental results on the TED-LIUM2 dataset show that our approach achieves a 3.4% relative WER reduction compared with the baseline system, and additional analytic experiments provide concrete experimental support for our assumption.</abstract>
<identifier type="citekey">gong-etal-2022-train</identifier>
<location>
<url>https://aclanthology.org/2022.cai-1.6</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>42</start>
<end>47</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can We Train a Language Model Inside an End-to-End ASR Model? - Investigating Effective Implicit Language Modeling
%A Gong, Zhuo
%A Saito, Daisuke
%A Li, Sheng
%A Kawai, Hisashi
%A Minematsu, Nobuaki
%Y Wu, Xianchao
%Y Ruan, Peiying
%Y Li, Sheng
%Y Dong, Yi
%S Proceedings of the Second Workshop on When Creative AI Meets Conversational AI
%D 2022
%8 October
%I Association for Computational Linguistics
%C Gyeongju, Republic of Korea
%F gong-etal-2022-train
%X Language models (LMs) have played crucial roles in automatic speech recognition (ASR) to enhance end-to-end (E2E) ASR systems’ performance. There are two categories of approaches: finding better ways to integrate LMs into ASR systems, and adapting LMs to the task domain. This article starts with a reflection on interpolation-based methods for combining E2E ASR scores and LM scores. We then focus on LM augmentation approaches based on the noisy channel model, motivated by insights obtained from that reflection. The experiments show that we can enhance an E2E ASR model based on the encoder-decoder architecture by pre-training the decoder with text data. This implies that the decoder of an E2E model can be treated as an LM and reveals the possibility of enhancing the E2E model without an external LM. Based on these ideas, we propose the implicit language model canceling method and further discuss the decoder of an E2E ASR model. Experimental results on the TED-LIUM2 dataset show that our approach achieves a 3.4% relative WER reduction compared with the baseline system, and additional analytic experiments provide concrete experimental support for our assumption.
%U https://aclanthology.org/2022.cai-1.6
%P 42-47
Markdown (Informal)
[Can We Train a Language Model Inside an End-to-End ASR Model? - Investigating Effective Implicit Language Modeling](https://aclanthology.org/2022.cai-1.6) (Gong et al., CAI 2022)
ACL
Zhuo Gong, Daisuke Saito, Sheng Li, Hisashi Kawai, and Nobuaki Minematsu. 2022. [Can We Train a Language Model Inside an End-to-End ASR Model? - Investigating Effective Implicit Language Modeling](https://aclanthology.org/2022.cai-1.6). In *Proceedings of the Second Workshop on When Creative AI Meets Conversational AI*, pages 42–47, Gyeongju, Republic of Korea. Association for Computational Linguistics.
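
The abstract describes interpolation-based integration of E2E ASR scores with LM scores and an implicit language model canceling method. A minimal sketch of how such score combinations are commonly written, assuming shallow-fusion-style log-linear interpolation over hypotheses Y for acoustic input X, with illustrative weights lambda and mu (the paper's exact formulation may differ):

% Shallow-fusion-style interpolation of E2E ASR and external LM scores;
% lambda is an assumed interpolation weight, notation is illustrative.
\[
  \hat{Y} = \operatorname*{arg\,max}_{Y}\;
  \log P_{\mathrm{E2E}}(Y \mid X) + \lambda \log P_{\mathrm{LM}}(Y)
\]

% Implicit (internal) LM canceling: subtract an estimate of the LM the
% decoder has learned implicitly before adding any external LM score;
% mu is an assumed canceling weight, and P_ILM denotes the decoder scored
% without acoustic context (an assumption, not the paper's own equation).
\[
  \hat{Y} = \operatorname*{arg\,max}_{Y}\;
  \log P_{\mathrm{E2E}}(Y \mid X) - \mu \log P_{\mathrm{ILM}}(Y)
  + \lambda \log P_{\mathrm{LM}}(Y)
\]

In the setting the abstract describes, a decoder pre-trained on text data plays the role of the LM, so the combined score can in principle be computed without an external LM; how P_ILM is estimated in the paper is not specified here.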