@inproceedings{miyazawa-sato-2025-transition,
title = "Transition Relevance Point Detection for Spoken Dialogue Systems with Self-Attention Transformer",
author = "Miyazawa, Kouki and
Sato, Yoshinao",
editor = "B{\'e}chet, Fr{\'e}d{\'e}ric and
Lef{\`e}vre, Fabrice and
Asher, Nicholas and
Kim, Seokhwan and
Merlin, Teva",
booktitle = "Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = aug,
year = "2025",
address = "Avignon, France",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sigdial-1.21/",
pages = "268--274",
abstract = "Most conventional spoken dialogue systems determine when to respond based on the elapsed time of silence following user speech utterances. This approach often results in failures of turn-taking, disrupting smooth communications with users. This study addresses the detection of when it is acceptable for the dialogue system to start speaking. Specifically, we aim to detect transition relevant points (TRPs) rather than predict whether the dialogue participants will actually start speaking. To achieve this, we employ a self-supervised speech representation using contrastive predictive coding and a self-attention transformer. The proposed model, TRPDformer, was trained and evaluated on the corpus of everyday Japanese conversation. TRPDformer outperformed a baseline model based on the elapsed time of silence. Furthermore, third-party listeners rated the timing of system responses determined using the proposed model as superior to that of the baseline in a preference test."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="miyazawa-sato-2025-transition">
<titleInfo>
<title>Transition Relevance Point Detection for Spoken Dialogue Systems with Self-Attention Transformer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kouki</namePart>
<namePart type="family">Miyazawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoshinao</namePart>
<namePart type="family">Sato</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabrice</namePart>
<namePart type="family">Lefèvre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Asher</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Teva</namePart>
<namePart type="family">Merlin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Avignon, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Most conventional spoken dialogue systems determine when to respond based on the elapsed time of silence following user speech utterances. This approach often results in failures of turn-taking, disrupting smooth communications with users. This study addresses the detection of when it is acceptable for the dialogue system to start speaking. Specifically, we aim to detect transition relevant points (TRPs) rather than predict whether the dialogue participants will actually start speaking. To achieve this, we employ a self-supervised speech representation using contrastive predictive coding and a self-attention transformer. The proposed model, TRPDformer, was trained and evaluated on the corpus of everyday Japanese conversation. TRPDformer outperformed a baseline model based on the elapsed time of silence. Furthermore, third-party listeners rated the timing of system responses determined using the proposed model as superior to that of the baseline in a preference test.</abstract>
<identifier type="citekey">miyazawa-sato-2025-transition</identifier>
<location>
<url>https://aclanthology.org/2025.sigdial-1.21/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>268</start>
<end>274</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Transition Relevance Point Detection for Spoken Dialogue Systems with Self-Attention Transformer
%A Miyazawa, Kouki
%A Sato, Yoshinao
%Y Béchet, Frédéric
%Y Lefèvre, Fabrice
%Y Asher, Nicholas
%Y Kim, Seokhwan
%Y Merlin, Teva
%S Proceedings of the 26th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2025
%8 August
%I Association for Computational Linguistics
%C Avignon, France
%F miyazawa-sato-2025-transition
%X Most conventional spoken dialogue systems determine when to respond based on the elapsed time of silence following user speech utterances. This approach often results in failures of turn-taking, disrupting smooth communications with users. This study addresses the detection of when it is acceptable for the dialogue system to start speaking. Specifically, we aim to detect transition relevant points (TRPs) rather than predict whether the dialogue participants will actually start speaking. To achieve this, we employ a self-supervised speech representation using contrastive predictive coding and a self-attention transformer. The proposed model, TRPDformer, was trained and evaluated on the corpus of everyday Japanese conversation. TRPDformer outperformed a baseline model based on the elapsed time of silence. Furthermore, third-party listeners rated the timing of system responses determined using the proposed model as superior to that of the baseline in a preference test.
%U https://aclanthology.org/2025.sigdial-1.21/
%P 268-274
Markdown (Informal)
[Transition Relevance Point Detection for Spoken Dialogue Systems with Self-Attention Transformer](https://aclanthology.org/2025.sigdial-1.21/) (Miyazawa & Sato, SIGDIAL 2025)
ACL