@inproceedings{lee-etal-2024-enhancing-dialogue,
title = "Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning",
author = "Lee, Wonjun and
Kim, San and
Lee, Gary Geunbae",
editor = "Kawahara, Tatsuya and
Demberg, Vera and
Ultes, Stefan and
Inoue, Koji and
Mehri, Shikib and
Howcroft, David and
Komatani, Kazunori",
booktitle = "Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = sep,
year = "2024",
address = "Kyoto, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.sigdial-1.30",
doi = "10.18653/v1/2024.sigdial-1.30",
pages = "333--343",
abstract = "Recent dialogue systems typically operate through turn-based spoken interactions between users and agents. These systems heavily depend on accurate Automatic Speech Recognition (ASR), as transcription errors can significantly degrade performance in downstream dialogue tasks. To alleviate this challenge, robust ASR is required, and one effective method is to utilize the dialogue context from user and agent interactions for transcribing the subsequent user utterance. This method incorporates the transcription of the user{'}s speech and the agent{'}s response as model input, using the accumulated context generated by each turn. However, this context is susceptible to ASR errors because the ASR model generates it auto-regressively. Such noisy context can further degrade the benefits of context input, resulting in suboptimal ASR performance. In this paper, we introduce context noise representation learning to enhance robustness against noisy context, ultimately improving dialogue speech recognition accuracy. To maximize the advantage of context awareness, our approach involves decoder pre-training with text-based dialogue data and noise representation learning for a context encoder. Evaluated on DSTC11 (MultiWoZ 2.1 audio dialogues), it achieves a 24{\%} relative reduction in Word Error Rate (WER) compared to wav2vec2.0 baselines and a 13{\%} reduction compared to Whisper-large-v2. Notably, in noisy environments where user speech is barely audible, our method proves its effectiveness by utilizing contextual information for accurate transcription. Tested on audio data with strong noise level (Signal Noise Ratio of 0dB), our approach shows up to a 31{\%} relative WER reduction compared to the wav2vec2.0 baseline, providing a reassuring solution for real-world noisy scenarios.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2024-enhancing-dialogue">
<titleInfo>
<title>Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wonjun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">San</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gary</namePart>
<namePart type="given">Geunbae</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefan</namePart>
<namePart type="family">Ultes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koji</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shikib</namePart>
<namePart type="family">Mehri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Howcroft</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kazunori</namePart>
<namePart type="family">Komatani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyoto, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent dialogue systems typically operate through turn-based spoken interactions between users and agents. These systems heavily depend on accurate Automatic Speech Recognition (ASR), as transcription errors can significantly degrade performance in downstream dialogue tasks. To alleviate this challenge, robust ASR is required, and one effective method is to utilize the dialogue context from user and agent interactions for transcribing the subsequent user utterance. This method incorporates the transcription of the user’s speech and the agent’s response as model input, using the accumulated context generated by each turn. However, this context is susceptible to ASR errors because the ASR model generates it auto-regressively. Such noisy context can further degrade the benefits of context input, resulting in suboptimal ASR performance. In this paper, we introduce context noise representation learning to enhance robustness against noisy context, ultimately improving dialogue speech recognition accuracy. To maximize the advantage of context awareness, our approach involves decoder pre-training with text-based dialogue data and noise representation learning for a context encoder. Evaluated on DSTC11 (MultiWoZ 2.1 audio dialogues), it achieves a 24% relative reduction in Word Error Rate (WER) compared to wav2vec2.0 baselines and a 13% reduction compared to Whisper-large-v2. Notably, in noisy environments where user speech is barely audible, our method proves its effectiveness by utilizing contextual information for accurate transcription. Tested on audio data with strong noise level (Signal Noise Ratio of 0dB), our approach shows up to a 31% relative WER reduction compared to the wav2vec2.0 baseline, providing a reassuring solution for real-world noisy scenarios.</abstract>
<identifier type="citekey">lee-etal-2024-enhancing-dialogue</identifier>
<identifier type="doi">10.18653/v1/2024.sigdial-1.30</identifier>
<location>
<url>https://aclanthology.org/2024.sigdial-1.30</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>333</start>
<end>343</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning
%A Lee, Wonjun
%A Kim, San
%A Lee, Gary Geunbae
%Y Kawahara, Tatsuya
%Y Demberg, Vera
%Y Ultes, Stefan
%Y Inoue, Koji
%Y Mehri, Shikib
%Y Howcroft, David
%Y Komatani, Kazunori
%S Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2024
%8 September
%I Association for Computational Linguistics
%C Kyoto, Japan
%F lee-etal-2024-enhancing-dialogue
%X Recent dialogue systems typically operate through turn-based spoken interactions between users and agents. These systems heavily depend on accurate Automatic Speech Recognition (ASR), as transcription errors can significantly degrade performance in downstream dialogue tasks. To alleviate this challenge, robust ASR is required, and one effective method is to utilize the dialogue context from user and agent interactions for transcribing the subsequent user utterance. This method incorporates the transcription of the user’s speech and the agent’s response as model input, using the accumulated context generated by each turn. However, this context is susceptible to ASR errors because the ASR model generates it auto-regressively. Such noisy context can further degrade the benefits of context input, resulting in suboptimal ASR performance. In this paper, we introduce context noise representation learning to enhance robustness against noisy context, ultimately improving dialogue speech recognition accuracy. To maximize the advantage of context awareness, our approach involves decoder pre-training with text-based dialogue data and noise representation learning for a context encoder. Evaluated on DSTC11 (MultiWoZ 2.1 audio dialogues), it achieves a 24% relative reduction in Word Error Rate (WER) compared to wav2vec2.0 baselines and a 13% reduction compared to Whisper-large-v2. Notably, in noisy environments where user speech is barely audible, our method proves its effectiveness by utilizing contextual information for accurate transcription. Tested on audio data with strong noise level (Signal Noise Ratio of 0dB), our approach shows up to a 31% relative WER reduction compared to the wav2vec2.0 baseline, providing a reassuring solution for real-world noisy scenarios.
%R 10.18653/v1/2024.sigdial-1.30
%U https://aclanthology.org/2024.sigdial-1.30
%U https://doi.org/10.18653/v1/2024.sigdial-1.30
%P 333-343
Markdown (Informal)
[Enhancing Dialogue Speech Recognition with Robust Contextual Awareness via Noise Representation Learning](https://aclanthology.org/2024.sigdial-1.30) (Lee et al., SIGDIAL 2024)
ACL