% ACL Anthology record 2026.acl-demo.56 (ACL 2026, Volume 3: System Demonstrations).
% The system name in the title is brace-protected as a whole word so that
% sentence-casing bibliography styles keep "UltraEval-Audio" intact.
@inproceedings{shi-etal-2026-ultraeval,
    title     = {{UltraEval-Audio}: A Unified Framework for Comprehensive Evaluation of Audio Foundation Models},
    author    = {Shi, Qundong and
                 Zhou, Jie and
                 Lin, Biyuan and
                 Cui, Junbo and
                 Zeng, Guoyang and
                 Zhou, Yixuan and
                 Wang, Ziyang and
                 Liu, Xin and
                 Luo, Zhen and
                 Wang, Yudong and
                 Liu, Zhiyuan},
    editor    = {Durrett, Greg and
                 Jian, Ping},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 3: System Demonstrations)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-demo.56/},
    pages     = {566--577},
    isbn      = {979-8-89176-392-0},
    abstract  = {The development of audio foundation models has accelerated rapidly since the emergence of GPT-4o. However, the lack of comprehensive evaluation has become a critical bottleneck for further progress in the field, particularly in audio generation. Current audio evaluation faces three major challenges: (1) audio evaluation lacks a unified framework, with datasets and code scattered across various sources; (2) audio codec, as a key component of audio foundation models, lacks a widely accepted and holistic evaluation methodology; (3) existing speech benchmarks are heavily reliant on English, making it challenging to objectively assess models' performance on Chinese. We introduce UltraEval-Audio, a unified framework addressing these challenges through a modular architecture supporting 10 languages, 14 task categories, 24 models, and 36 benchmarks with one-command evaluation and real-time leaderboards. For audio codec, we propose a three-dimensional evaluation scheme covering semantic accuracy, timbre fidelity, and acoustic quality. For Chinese evaluation, we introduce two new benchmarks: SpeechCMMLU and SpeechHSK. Our code, benchmarks, and leaderboards are available at https://github.com/OpenBMB/UltraEval-Audio.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey shi-etal-2026-ultraeval: the paper itself, with its
     host proceedings described in the nested relatedItem type="host". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="shi-etal-2026-ultraeval">
    <titleInfo>
      <title>UltraEval-Audio: A Unified Framework for Comprehensive Evaluation of Audio Foundation Models</title>
    </titleInfo>
    <!-- Authors, in byline order -->
    <name type="personal">
      <namePart type="given">Qundong</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jie</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Biyuan</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junbo</namePart>
      <namePart type="family">Cui</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guoyang</namePart>
      <namePart type="family">Zeng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yixuan</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ziyang</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xin</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhen</namePart>
      <namePart type="family">Luo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yudong</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhiyuan</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Greg</namePart>
        <namePart type="family">Durrett</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ping</namePart>
        <namePart type="family">Jian</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-392-0</identifier>
    </relatedItem>
    <abstract>The development of audio foundation models has accelerated rapidly since the emergence of GPT-4o. However, the lack of comprehensive evaluation has become a critical bottleneck for further progress in the field, particularly in audio generation. Current audio evaluation faces three major challenges: (1) audio evaluation lacks a unified framework, with datasets and code scattered across various sources; (2) audio codec, as a key component of audio foundation models, lacks a widely accepted and holistic evaluation methodology; (3) existing speech benchmarks are heavily reliant on English, making it challenging to objectively assess models’ performance on Chinese. We introduce UltraEval-Audio, a unified framework addressing these challenges through a modular architecture supporting 10 languages, 14 task categories, 24 models, and 36 benchmarks with one-command evaluation and real-time leaderboards. For audio codec, we propose a three-dimensional evaluation scheme covering semantic accuracy, timbre fidelity, and acoustic quality. For Chinese evaluation, we introduce two new benchmarks: SpeechCMMLU and SpeechHSK. Our code, benchmarks, and leaderboards are available at https://github.com/OpenBMB/UltraEval-Audio.</abstract>
    <identifier type="citekey">shi-etal-2026-ultraeval</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-demo.56/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>566</start>
        <end>577</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T UltraEval-Audio: A Unified Framework for Comprehensive Evaluation of Audio Foundation Models
%A Shi, Qundong
%A Zhou, Jie
%A Lin, Biyuan
%A Cui, Junbo
%A Zeng, Guoyang
%A Zhou, Yixuan
%A Wang, Ziyang
%A Liu, Xin
%A Luo, Zhen
%A Wang, Yudong
%A Liu, Zhiyuan
%Y Durrett, Greg
%Y Jian, Ping
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-392-0
%F shi-etal-2026-ultraeval
%X The development of audio foundation models has accelerated rapidly since the emergence of GPT-4o. However, the lack of comprehensive evaluation has become a critical bottleneck for further progress in the field, particularly in audio generation. Current audio evaluation faces three major challenges: (1) audio evaluation lacks a unified framework, with datasets and code scattered across various sources; (2) audio codec, as a key component of audio foundation models, lacks a widely accepted and holistic evaluation methodology; (3) existing speech benchmarks are heavily reliant on English, making it challenging to objectively assess models’ performance on Chinese. We introduce UltraEval-Audio, a unified framework addressing these challenges through a modular architecture supporting 10 languages, 14 task categories, 24 models, and 36 benchmarks with one-command evaluation and real-time leaderboards. For audio codec, we propose a three-dimensional evaluation scheme covering semantic accuracy, timbre fidelity, and acoustic quality. For Chinese evaluation, we introduce two new benchmarks: SpeechCMMLU and SpeechHSK. Our code, benchmarks, and leaderboards are available at https://github.com/OpenBMB/UltraEval-Audio.
%U https://aclanthology.org/2026.acl-demo.56/
%P 566-577
Markdown (Informal)
[UltraEval-Audio: A Unified Framework for Comprehensive Evaluation of Audio Foundation Models](https://aclanthology.org/2026.acl-demo.56/) (Shi et al., ACL 2026)
ACL
- Qundong Shi, Jie Zhou, Biyuan Lin, Junbo Cui, Guoyang Zeng, Yixuan Zhou, Ziyang Wang, Xin Liu, Zhen Luo, Yudong Wang, and Zhiyuan Liu. 2026. UltraEval-Audio: A Unified Framework for Comprehensive Evaluation of Audio Foundation Models. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations), pages 566–577, San Diego, California, United States. Association for Computational Linguistics.