@inproceedings{gosai-etal-2026-audio,
title = "Audio {M}ulti{C}hallenge: A Multi-Turn Evaluation of Spoken Dialogue Systems on Natural Human Interaction",
author = "Gosai, Advait and
Vuong, Tyler and
Tyagi, Utkarsh and
Li, Steven and
You, Wenjia and
Bavare, Miheer and
U{\c{c}}ar, Arda and
Fang, Zhongwang and
Jang, Brian and
Liu, Bing and
He, Yunzhong",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.1654/",
pages = "35740--35770",
ISBN = "979-8-89176-390-6",
abstract = "End-to-end (E2E) spoken dialogue systems are replacing cascaded pipelines for voice-based human-AI interaction. Existing benchmarks primarily evaluate these systems on synthetic speech and single-turn tasks, leaving multi-turn conversational ability underexplored. We introduce Audio MultiChallenge, an open-source benchmark to evaluate these systems under natural multi-turn interaction patterns. Building on the text-based MultiChallenge framework, which evaluates Inference Memory, Instruction Retention, and Self Coherence, we introduce a new axis, Voice Editing, that tests robustness to mid-utterance speech repairs and backtracking. We augment each axis to the audio modality, such as introducing Audio-Cue challenges for Inference Memory that require recalling ambient sounds and paralinguistic signals beyond semantic content. We curate 452 conversations from 47 speakers with 1,712 instance-specific rubrics through a hybrid pipeline that exposes model failures at scale while preserving natural disfluencies found in unscripted human speech. Our evaluation reveals that even frontier models struggle on our benchmark, with our highest-performing model achieving a 54.65{\%} pass rate. Error analysis shows that models are not sufficiently robust to human speech when tracking instructions, edits, and audio cues, highlighting the need for improved audio-native multi-turn interaction capabilities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gosai-etal-2026-audio">
<titleInfo>
<title>Audio MultiChallenge: A Multi-Turn Evaluation of Spoken Dialogue Systems on Natural Human Interaction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Advait</namePart>
<namePart type="family">Gosai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tyler</namePart>
<namePart type="family">Vuong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Utkarsh</namePart>
<namePart type="family">Tyagi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjia</namePart>
<namePart type="family">You</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miheer</namePart>
<namePart type="family">Bavare</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arda</namePart>
<namePart type="family">Uçar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongwang</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Jang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunzhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>End-to-end (E2E) spoken dialogue systems are replacing cascaded pipelines for voice-based human-AI interaction. Existing benchmarks primarily evaluate these systems on synthetic speech and single-turn tasks, leaving multi-turn conversational ability underexplored. We introduce Audio MultiChallenge, an open-source benchmark to evaluate these systems under natural multi-turn interaction patterns. Building on the text-based MultiChallenge framework, which evaluates Inference Memory, Instruction Retention, and Self Coherence, we introduce a new axis, Voice Editing, that tests robustness to mid-utterance speech repairs and backtracking. We augment each axis to the audio modality, such as introducing Audio-Cue challenges for Inference Memory that require recalling ambient sounds and paralinguistic signals beyond semantic content. We curate 452 conversations from 47 speakers with 1,712 instance-specific rubrics through a hybrid pipeline that exposes model failures at scale while preserving natural disfluencies found in unscripted human speech. Our evaluation reveals that even frontier models struggle on our benchmark, with our highest-performing model achieving a 54.65% pass rate. Error analysis shows that models are not sufficiently robust to human speech when tracking instructions, edits, and audio cues, highlighting the need for improved audio-native multi-turn interaction capabilities.</abstract>
<identifier type="citekey">gosai-etal-2026-audio</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.1654/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>35740</start>
<end>35770</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Audio MultiChallenge: A Multi-Turn Evaluation of Spoken Dialogue Systems on Natural Human Interaction
%A Gosai, Advait
%A Vuong, Tyler
%A Tyagi, Utkarsh
%A Li, Steven
%A You, Wenjia
%A Bavare, Miheer
%A Uçar, Arda
%A Fang, Zhongwang
%A Jang, Brian
%A Liu, Bing
%A He, Yunzhong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F gosai-etal-2026-audio
%X End-to-end (E2E) spoken dialogue systems are replacing cascaded pipelines for voice-based human-AI interaction. Existing benchmarks primarily evaluate these systems on synthetic speech and single-turn tasks, leaving multi-turn conversational ability underexplored. We introduce Audio MultiChallenge, an open-source benchmark to evaluate these systems under natural multi-turn interaction patterns. Building on the text-based MultiChallenge framework, which evaluates Inference Memory, Instruction Retention, and Self Coherence, we introduce a new axis, Voice Editing, that tests robustness to mid-utterance speech repairs and backtracking. We augment each axis to the audio modality, such as introducing Audio-Cue challenges for Inference Memory that require recalling ambient sounds and paralinguistic signals beyond semantic content. We curate 452 conversations from 47 speakers with 1,712 instance-specific rubrics through a hybrid pipeline that exposes model failures at scale while preserving natural disfluencies found in unscripted human speech. Our evaluation reveals that even frontier models struggle on our benchmark, with our highest-performing model achieving a 54.65% pass rate. Error analysis shows that models are not sufficiently robust to human speech when tracking instructions, edits, and audio cues, highlighting the need for improved audio-native multi-turn interaction capabilities.
%U https://aclanthology.org/2026.acl-long.1654/
%P 35740-35770
Markdown (Informal)
[Audio MultiChallenge: A Multi-Turn Evaluation of Spoken Dialogue Systems on Natural Human Interaction](https://aclanthology.org/2026.acl-long.1654/) (Gosai et al., ACL 2026)
ACL
- Advait Gosai, Tyler Vuong, Utkarsh Tyagi, Steven Li, Wenjia You, Miheer Bavare, Arda Uçar, Zhongwang Fang, Brian Jang, Bing Liu, and Yunzhong He. 2026. Audio MultiChallenge: A Multi-Turn Evaluation of Spoken Dialogue Systems on Natural Human Interaction. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 35740–35770, San Diego, California, United States. Association for Computational Linguistics.