@inproceedings{yamamoto-etal-2025-analysis,
title = "Analysis of Voice Activity Detection Errors in {API}-based Streaming {ASR} for Human-Robot Dialogue",
author = "Yamamoto, Kenta and
Takeda, Ryu and
Komatani, Kazunori",
editor = "Torres, Maria Ines and
Matsuda, Yuki and
Callejas, Zoraida and
del Pozo, Arantza and
D'Haro, Luis Fernando",
booktitle = "Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology",
month = may,
year = "2025",
address = "Bilbao, Spain",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.iwsds-1.26/",
pages = "245--253",
ISBN = "979-8-89176-248-0",
abstract = "In human-robot dialogue systems, streaming automatic speech recognition (ASR) services (e.g., Google ASR) are often utilized, with the microphone positioned close to the robot{'}s loudspeaker. Under these conditions, both the robot{'}s and the user{'}s utterances are captured, resulting in frequent failures to detect user speech. This study analyzes voice activity detection (VAD) errors by comparing results from such streaming ASR to those from standalone VAD models. Experiments conducted on three distinct dialogue datasets showed that streaming ASR tends to ignore user utterances immediately following system utterances. We discuss the underlying causes of these VAD errors and provide recommendations for improving VAD performance in human-robot dialogue."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yamamoto-etal-2025-analysis">
<titleInfo>
<title>Analysis of Voice Activity Detection Errors in API-based Streaming ASR for Human-Robot Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kenta</namePart>
<namePart type="family">Yamamoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryu</namePart>
<namePart type="family">Takeda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kazunori</namePart>
<namePart type="family">Komatani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Ines</namePart>
<namePart type="family">Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuki</namePart>
<namePart type="family">Matsuda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoraida</namePart>
<namePart type="family">Callejas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arantza</namePart>
<namePart type="family">del Pozo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bilbao, Spain</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-248-0</identifier>
</relatedItem>
<abstract>In human-robot dialogue systems, streaming automatic speech recognition (ASR) services (e.g., Google ASR) are often utilized, with the microphone positioned close to the robot’s loudspeaker. Under these conditions, both the robot’s and the user’s utterances are captured, resulting in frequent failures to detect user speech. This study analyzes voice activity detection (VAD) errors by comparing results from such streaming ASR to those from standalone VAD models. Experiments conducted on three distinct dialogue datasets showed that streaming ASR tends to ignore user utterances immediately following system utterances. We discuss the underlying causes of these VAD errors and provide recommendations for improving VAD performance in human-robot dialogue.</abstract>
<identifier type="citekey">yamamoto-etal-2025-analysis</identifier>
<location>
<url>https://aclanthology.org/2025.iwsds-1.26/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>245</start>
<end>253</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Analysis of Voice Activity Detection Errors in API-based Streaming ASR for Human-Robot Dialogue
%A Yamamoto, Kenta
%A Takeda, Ryu
%A Komatani, Kazunori
%Y Torres, Maria Ines
%Y Matsuda, Yuki
%Y Callejas, Zoraida
%Y del Pozo, Arantza
%Y D’Haro, Luis Fernando
%S Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology
%D 2025
%8 May
%I Association for Computational Linguistics
%C Bilbao, Spain
%@ 979-8-89176-248-0
%F yamamoto-etal-2025-analysis
%X In human-robot dialogue systems, streaming automatic speech recognition (ASR) services (e.g., Google ASR) are often utilized, with the microphone positioned close to the robot’s loudspeaker. Under these conditions, both the robot’s and the user’s utterances are captured, resulting in frequent failures to detect user speech. This study analyzes voice activity detection (VAD) errors by comparing results from such streaming ASR to those from standalone VAD models. Experiments conducted on three distinct dialogue datasets showed that streaming ASR tends to ignore user utterances immediately following system utterances. We discuss the underlying causes of these VAD errors and provide recommendations for improving VAD performance in human-robot dialogue.
%U https://aclanthology.org/2025.iwsds-1.26/
%P 245-253
Markdown (Informal)
[Analysis of Voice Activity Detection Errors in API-based Streaming ASR for Human-Robot Dialogue](https://aclanthology.org/2025.iwsds-1.26/) (Yamamoto et al., IWSDS 2025)
ACL
Kenta Yamamoto, Ryu Takeda, and Kazunori Komatani. 2025. Analysis of Voice Activity Detection Errors in API-based Streaming ASR for Human-Robot Dialogue. In Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology, pages 245–253, Bilbao, Spain. Association for Computational Linguistics.