% ACL Anthology record for a COLING 2025 main-conference paper.
% All names use the "Last, First" form so that multi-word surnames
% (e.g. "Di Eugenio") are parsed as a single family name.
@inproceedings{chiba-higashinaka-2025-investigating,
    title     = {Investigating the Impact of Incremental Processing and {Voice Activity Projection} on Spoken Dialogue Systems},
    author    = {Chiba, Yuya and
                 Higashinaka, Ryuichiro},
    editor    = {Rambow, Owen and
                 Wanner, Leo and
                 Apidianaki, Marianna and
                 Al-Khalifa, Hend and
                 Di Eugenio, Barbara and
                 Schockaert, Steven},
    booktitle = {Proceedings of the 31st International Conference on Computational Linguistics},
    month     = jan,
    year      = {2025},
    address   = {Abu Dhabi, UAE},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.coling-main.249/},
    pages     = {3687--3696},
    abstract  = {The naturalness of responses in spoken dialogue systems has been significantly improved by the introduction of large language models (LLMs), although many challenges remain until human-like turn-taking can be achieved. A turn-taking model called Voice Activity Projection (VAP) is gaining attention because it can be trained in an unsupervised manner using the spoken dialogue data between two speakers. For such a turn-taking model to be fully effective, systems must initiate response generation as soon as a turn-shift is detected. This can be achieved by incremental response generation, which reduces the delay before the system responds. Incremental response generation is done using partial speech recognition results while user speech is incrementally processed. Combining incremental response generation with VAP-based turn-taking will enable spoken dialogue systems to achieve faster and more natural turn-taking. However, their effectiveness remains unclear because they have not yet been evaluated in real-world systems. In this study, we developed spoken dialogue systems that incorporate incremental response generation and VAP-based turn-taking and evaluated their impact on task success and dialogue satisfaction through user assessments.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for a single conference paper.
     The paper itself is described at the top level of <mods>; the proceedings
     volume (title, editors, publisher, place) is the <relatedItem type="host">.
     Each <name> carries one given part and one family part; compound surnames
     such as "Di Eugenio" stay together in the family part. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chiba-higashinaka-2025-investigating">
    <titleInfo>
      <title>Investigating the Impact of Incremental Processing and Voice Activity Projection on Spoken Dialogue Systems</title>
    </titleInfo>
    <!-- Paper authors -->
    <name type="personal">
      <namePart type="given">Yuya</namePart>
      <namePart type="family">Chiba</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ryuichiro</namePart>
      <namePart type="family">Higashinaka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume and its editors -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Barbara</namePart>
        <namePart type="family">Di Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The naturalness of responses in spoken dialogue systems has been significantly improved by the introduction of large language models (LLMs), although many challenges remain until human-like turn-taking can be achieved. A turn-taking model called Voice Activity Projection (VAP) is gaining attention because it can be trained in an unsupervised manner using the spoken dialogue data between two speakers. For such a turn-taking model to be fully effective, systems must initiate response generation as soon as a turn-shift is detected. This can be achieved by incremental response generation, which reduces the delay before the system responds. Incremental response generation is done using partial speech recognition results while user speech is incrementally processed. Combining incremental response generation with VAP-based turn-taking will enable spoken dialogue systems to achieve faster and more natural turn-taking. However, their effectiveness remains unclear because they have not yet been evaluated in real-world systems. In this study, we developed spoken dialogue systems that incorporate incremental response generation and VAP-based turn-taking and evaluated their impact on task success and dialogue satisfaction through user assessments.</abstract>
    <identifier type="citekey">chiba-higashinaka-2025-investigating</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.249/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>3687</start>
        <end>3696</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Investigating the Impact of Incremental Processing and Voice Activity Projection on Spoken Dialogue Systems
%A Chiba, Yuya
%A Higashinaka, Ryuichiro
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F chiba-higashinaka-2025-investigating
%X The naturalness of responses in spoken dialogue systems has been significantly improved by the introduction of large language models (LLMs), although many challenges remain until human-like turn-taking can be achieved. A turn-taking model called Voice Activity Projection (VAP) is gaining attention because it can be trained in an unsupervised manner using the spoken dialogue data between two speakers. For such a turn-taking model to be fully effective, systems must initiate response generation as soon as a turn-shift is detected. This can be achieved by incremental response generation, which reduces the delay before the system responds. Incremental response generation is done using partial speech recognition results while user speech is incrementally processed. Combining incremental response generation with VAP-based turn-taking will enable spoken dialogue systems to achieve faster and more natural turn-taking. However, their effectiveness remains unclear because they have not yet been evaluated in real-world systems. In this study, we developed spoken dialogue systems that incorporate incremental response generation and VAP-based turn-taking and evaluated their impact on task success and dialogue satisfaction through user assessments.
%U https://aclanthology.org/2025.coling-main.249/
%P 3687-3696
Markdown (Informal)
[Investigating the Impact of Incremental Processing and Voice Activity Projection on Spoken Dialogue Systems](https://aclanthology.org/2025.coling-main.249/) (Chiba & Higashinaka, COLING 2025)
ACL