@inproceedings{takeda-komatani-2026-retrospective,
title = "Retrospective Speech Recognition for Spoken Dialogue System: Exploiting Subsequent Utterances to Enhance {ASR} Performance",
author = "Takeda, Ryu and
Komatani, Kazunori",
editor = "Riccardi, Giuseppe and
Mousavi, Seyed Mahed and
Torres, Maria Ines and
Yoshino, Koichiro and
Callejas, Zoraida and
Chowdhury, Shammur Absar and
Chen, Yun-Nung and
Bechet, Frederic and
Gustafson, Joakim and
Damnati, G{\'e}raldine and
Papangelis, Alex and
D{'}Haro, Luis Fernando and
Mendon{\c{c}}a, John and
Bernardi, Raffaella and
Hakkani-Tur, Dilek and
  Di Fabbrizio, Giuseppe {``}Pino{''} and
Kawahara, Tatsuya and
Alam, Firoj and
Tur, Gokhan and
Johnston, Michael",
booktitle = "Proceedings of the 16th International Workshop on Spoken Dialogue System Technology",
month = feb,
year = "2026",
address = "Trento, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwsds-1.20/",
pages = "182--192",
  abstract = "Spoken dialogue systems would benefit from the ability of self-correction, namely, revising earlier recognition results once later utterances are available, as humans often do in dialogue. However, conventional automatic speech recognition ({ASR}) frameworks mainly process user utterances sequentially and rely only on the preceding context. To address this limitation, we propose Retrospective Speech Recognition ({RSR}), which refines past recognition results by exploiting their subsequent utterances. We formulate and implement an {RSR} model for a dialogue system setting in which system utterances can also be utilized. Each past user utterance is processed with an interpretable syllabogram representation, which integrates preceding and subsequent utterances within a shared domain between the signal and text levels. This intermediate representation also helps reduce orthographic inconsistencies. Experimental results using real {J}apanese dialogue speech showed that utilizing the subsequent utterances improved the character error rate by 0.10 points, which demonstrates the utility of {RSR}. We also investigated the impact of other factors, such as the utilization of system utterances."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="takeda-komatani-2026-retrospective">
<titleInfo>
<title>Retrospective Speech Recognition for Spoken Dialogue System: Exploiting Subsequent Utterances to Enhance ASR Performance</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ryu</namePart>
<namePart type="family">Takeda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kazunori</namePart>
<namePart type="family">Komatani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Workshop on Spoken Dialogue System Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Riccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seyed</namePart>
<namePart type="given">Mahed</namePart>
<namePart type="family">Mousavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Ines</namePart>
<namePart type="family">Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichiro</namePart>
<namePart type="family">Yoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoraida</namePart>
<namePart type="family">Callejas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Bechet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Gustafson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Géraldine</namePart>
<namePart type="family">Damnati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Papangelis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Mendonça</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="given">”Pino”</namePart>
<namePart type="family">Di Fabbrizio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokhan</namePart>
<namePart type="family">Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Johnston</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Trento, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Spoken dialogue systems would benefit from the ability of self-correction, namely, revising earlier recognition results once later utterances are available, as humans often do in dialogue. However, conventional automatic speech recognition (ASR) frameworks mainly process user utterances sequentially and rely only on the preceding context. To address this limitation, we propose Retrospective Speech Recognition (RSR), which refines past recognition results by exploiting their subsequent utterances. We formulate and implement an RSR model for a dialogue system setting in which system utterances can also be utilized. Each past user utterance is processed with an interpretable syllabogram representation, which integrates preceding and subsequent utterances within a shared domain between the signal and text levels. This intermediate representation also helps reduce orthographic inconsistencies. Experimental results using real Japanese dialogue speech showed that utilizing the subsequent utterances improved the character error rate by 0.10 points, which demonstrates the utility of RSR. We also investigated the impact of other factors, such as the utilization of system utterances.</abstract>
<identifier type="citekey">takeda-komatani-2026-retrospective</identifier>
<location>
<url>https://aclanthology.org/2026.iwsds-1.20/</url>
</location>
<part>
<date>2026-02</date>
<extent unit="page">
<start>182</start>
<end>192</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Retrospective Speech Recognition for Spoken Dialogue System: Exploiting Subsequent Utterances to Enhance ASR Performance
%A Takeda, Ryu
%A Komatani, Kazunori
%Y Riccardi, Giuseppe
%Y Mousavi, Seyed Mahed
%Y Torres, Maria Ines
%Y Yoshino, Koichiro
%Y Callejas, Zoraida
%Y Chowdhury, Shammur Absar
%Y Chen, Yun-Nung
%Y Bechet, Frederic
%Y Gustafson, Joakim
%Y Damnati, Géraldine
%Y Papangelis, Alex
%Y D’Haro, Luis Fernando
%Y Mendonça, John
%Y Bernardi, Raffaella
%Y Hakkani-Tur, Dilek
%Y Di Fabbrizio, Giuseppe “Pino”
%Y Kawahara, Tatsuya
%Y Alam, Firoj
%Y Tur, Gokhan
%Y Johnston, Michael
%S Proceedings of the 16th International Workshop on Spoken Dialogue System Technology
%D 2026
%8 February
%I Association for Computational Linguistics
%C Trento, Italy
%F takeda-komatani-2026-retrospective
%X Spoken dialogue systems would benefit from the ability of self-correction, namely, revising earlier recognition results once later utterances are available, as humans often do in dialogue. However, conventional automatic speech recognition (ASR) frameworks mainly process user utterances sequentially and rely only on the preceding context. To address this limitation, we propose Retrospective Speech Recognition (RSR), which refines past recognition results by exploiting their subsequent utterances. We formulate and implement an RSR model for a dialogue system setting in which system utterances can also be utilized. Each past user utterance is processed with an interpretable syllabogram representation, which integrates preceding and subsequent utterances within a shared domain between the signal and text levels. This intermediate representation also helps reduce orthographic inconsistencies. Experimental results using real Japanese dialogue speech showed that utilizing the subsequent utterances improved the character error rate by 0.10 points, which demonstrates the utility of RSR. We also investigated the impact of other factors, such as the utilization of system utterances.
%U https://aclanthology.org/2026.iwsds-1.20/
%P 182-192

Markdown (Informal)
[Retrospective Speech Recognition for Spoken Dialogue System: Exploiting Subsequent Utterances to Enhance ASR Performance](https://aclanthology.org/2026.iwsds-1.20/) (Takeda & Komatani, IWSDS 2026)
ACL
Ryu Takeda and Kazunori Komatani. 2026. Retrospective Speech Recognition for Spoken Dialogue System: Exploiting Subsequent Utterances to Enhance ASR Performance. In Proceedings of the 16th International Workshop on Spoken Dialogue System Technology, pages 182–192, Trento, Italy. Association for Computational Linguistics.