@inproceedings{nguyen-etal-2024-improving,
title = "Improving Speech Recognition with Jargon Injection",
author = "Nguyen, Minh-Tien and
Nguyen, Dat Phuoc and
Luu, Tuan-Hai and
Nguyen, Xuan-Quang and
Nguyen, Tung-Duong and
Yang, Jeff",
editor = "Kawahara, Tatsuya and
Demberg, Vera and
Ultes, Stefan and
Inoue, Koji and
Mehri, Shikib and
Howcroft, David and
Komatani, Kazunori",
booktitle = "Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = sep,
year = "2024",
address = "Kyoto, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.sigdial-1.42",
doi = "10.18653/v1/2024.sigdial-1.42",
pages = "490--499",
abstract = "This paper introduces a new method that improves the performance of Automatic speech recognition (ASR) engines, e.g., Whisper in practical cases. Different from prior methods that usually require both speech data and its transcription for decoding, our method only uses jargon as the context for decoding. To do that, the method first represents the jargon in a trie tree structure for efficient storing and traversing. The method next forces the decoding of Whisper to more focus on the jargon by adjusting the probability of generated tokens with the use of the trie tree. To further improve the performance, the method utilizes the prompting method that uses the jargon as the context. Final tokens are generated based on the combination of prompting and decoding. Experimental results on Japanese and English datasets show that the proposed method helps to improve the performance of Whisper, specially for domain-specific data. The method is simple but effective and can be deployed to any encoder-decoder ASR engines in actual cases. The code and data are also accessible (https://shorturl.at/nMsaY).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nguyen-etal-2024-improving">
<titleInfo>
<title>Improving Speech Recognition with Jargon Injection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Minh-Tien</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dat</namePart>
<namePart type="given">Phuoc</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tuan-Hai</namePart>
<namePart type="family">Luu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuan-Quang</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tung-Duong</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeff</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Ultes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koji</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shikib</namePart>
<namePart type="family">Mehri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Howcroft</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kazunori</namePart>
<namePart type="family">Komatani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyoto, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces a new method that improves the performance of Automatic speech recognition (ASR) engines, e.g., Whisper in practical cases. Different from prior methods that usually require both speech data and its transcription for decoding, our method only uses jargon as the context for decoding. To do that, the method first represents the jargon in a trie tree structure for efficient storing and traversing. The method next forces the decoding of Whisper to more focus on the jargon by adjusting the probability of generated tokens with the use of the trie tree. To further improve the performance, the method utilizes the prompting method that uses the jargon as the context. Final tokens are generated based on the combination of prompting and decoding. Experimental results on Japanese and English datasets show that the proposed method helps to improve the performance of Whisper, specially for domain-specific data. The method is simple but effective and can be deployed to any encoder-decoder ASR engines in actual cases. The code and data are also accessible (https://shorturl.at/nMsaY).</abstract>
<identifier type="citekey">nguyen-etal-2024-improving</identifier>
<identifier type="doi">10.18653/v1/2024.sigdial-1.42</identifier>
<location>
<url>https://aclanthology.org/2024.sigdial-1.42</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>490</start>
<end>499</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Speech Recognition with Jargon Injection
%A Nguyen, Minh-Tien
%A Nguyen, Dat Phuoc
%A Luu, Tuan-Hai
%A Nguyen, Xuan-Quang
%A Nguyen, Tung-Duong
%A Yang, Jeff
%Y Kawahara, Tatsuya
%Y Demberg, Vera
%Y Ultes, Stefan
%Y Inoue, Koji
%Y Mehri, Shikib
%Y Howcroft, David
%Y Komatani, Kazunori
%S Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2024
%8 September
%I Association for Computational Linguistics
%C Kyoto, Japan
%F nguyen-etal-2024-improving
%X This paper introduces a new method that improves the performance of Automatic speech recognition (ASR) engines, e.g., Whisper in practical cases. Different from prior methods that usually require both speech data and its transcription for decoding, our method only uses jargon as the context for decoding. To do that, the method first represents the jargon in a trie tree structure for efficient storing and traversing. The method next forces the decoding of Whisper to more focus on the jargon by adjusting the probability of generated tokens with the use of the trie tree. To further improve the performance, the method utilizes the prompting method that uses the jargon as the context. Final tokens are generated based on the combination of prompting and decoding. Experimental results on Japanese and English datasets show that the proposed method helps to improve the performance of Whisper, specially for domain-specific data. The method is simple but effective and can be deployed to any encoder-decoder ASR engines in actual cases. The code and data are also accessible (https://shorturl.at/nMsaY).
%R 10.18653/v1/2024.sigdial-1.42
%U https://aclanthology.org/2024.sigdial-1.42
%U https://doi.org/10.18653/v1/2024.sigdial-1.42
%P 490-499
Markdown (Informal)
[Improving Speech Recognition with Jargon Injection](https://aclanthology.org/2024.sigdial-1.42) (Nguyen et al., SIGDIAL 2024)
ACL
- Minh-Tien Nguyen, Dat Phuoc Nguyen, Tuan-Hai Luu, Xuan-Quang Nguyen, Tung-Duong Nguyen, and Jeff Yang. 2024. Improving Speech Recognition with Jargon Injection. In Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue, pages 490–499, Kyoto, Japan. Association for Computational Linguistics.