@inproceedings{shi-etal-2025-versa,
title = "{VERSA}: A Versatile Evaluation Toolkit for Speech, Audio, and Music",
author = "Shi, Jiatong and
Shim, Hye-jin and
Tian, Jinchuan and
Arora, Siddhant and
Wu, Haibin and
Petermann, Darius and
Yip, Jia Qi and
Zhang, You and
Tang, Yuxun and
Zhang, Wangyou and
Alharthi, Dareen Safar and
Huang, Yichen and
Saito, Koichi and
Han, Jionghao and
Zhao, Yiwen and
Donahue, Chris and
Watanabe, Shinji",
editor = "Dziri, Nouha and
Ren, Sean (Xiang) and
Diao, Shizhe",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-demo.19/",
doi = "10.18653/v1/2025.naacl-demo.19",
pages = "191--209",
ISBN = "979-8-89176-191-9",
abstract = "In this work, we introduce VERSA, a unified and standardized evaluation toolkit designed for various speech, audio, and music signals. The toolkit features a Pythonic interface with flexible configuration and dependency control, making it user-friendly and efficient. With full installation, VERSA offers 65 metrics with 729 metric variations based on different configurations. These metrics encompass evaluations utilizing diverse external resources, including matching and non-matching reference audio, text transcriptions, and text captions. As a lightweight yet comprehensive toolkit, VERSA is versatile to support the evaluation of a wide range of downstream scenarios. To demonstrate its capabilities, this work highlights example use cases for VERSA, including audio coding, speech synthesis, speech enhancement, singing synthesis, and music generation. The toolkit is available at https://github.com/shinjiwlab/versa."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shi-etal-2025-versa">
<titleInfo>
<title>VERSA: A Versatile Evaluation Toolkit for Speech, Audio, and Music</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiatong</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hye-jin</namePart>
<namePart type="family">Shim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinchuan</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siddhant</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haibin</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Darius</namePart>
<namePart type="family">Petermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jia</namePart>
<namePart type="given">Qi</namePart>
<namePart type="family">Yip</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">You</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxun</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wangyou</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dareen</namePart>
<namePart type="given">Safar</namePart>
<namePart type="family">Alharthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yichen</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichi</namePart>
<namePart type="family">Saito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jionghao</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiwen</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Donahue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shinji</namePart>
<namePart type="family">Watanabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nouha</namePart>
<namePart type="family">Dziri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sean</namePart>
<namePart type="given">(Xiang)</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shizhe</namePart>
<namePart type="family">Diao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-191-9</identifier>
</relatedItem>
<abstract>In this work, we introduce VERSA, a unified and standardized evaluation toolkit designed for various speech, audio, and music signals. The toolkit features a Pythonic interface with flexible configuration and dependency control, making it user-friendly and efficient. With full installation, VERSA offers 65 metrics with 729 metric variations based on different configurations. These metrics encompass evaluations utilizing diverse external resources, including matching and non-matching reference audio, text transcriptions, and text captions. As a lightweight yet comprehensive toolkit, VERSA is versatile to support the evaluation of a wide range of downstream scenarios. To demonstrate its capabilities, this work highlights example use cases for VERSA, including audio coding, speech synthesis, speech enhancement, singing synthesis, and music generation. The toolkit is available at https://github.com/shinjiwlab/versa.</abstract>
<identifier type="citekey">shi-etal-2025-versa</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-demo.19</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-demo.19/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>191</start>
<end>209</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VERSA: A Versatile Evaluation Toolkit for Speech, Audio, and Music
%A Shi, Jiatong
%A Shim, Hye-jin
%A Tian, Jinchuan
%A Arora, Siddhant
%A Wu, Haibin
%A Petermann, Darius
%A Yip, Jia Qi
%A Zhang, You
%A Tang, Yuxun
%A Zhang, Wangyou
%A Alharthi, Dareen Safar
%A Huang, Yichen
%A Saito, Koichi
%A Han, Jionghao
%A Zhao, Yiwen
%A Donahue, Chris
%A Watanabe, Shinji
%Y Dziri, Nouha
%Y Ren, Sean (Xiang)
%Y Diao, Shizhe
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-191-9
%F shi-etal-2025-versa
%X In this work, we introduce VERSA, a unified and standardized evaluation toolkit designed for various speech, audio, and music signals. The toolkit features a Pythonic interface with flexible configuration and dependency control, making it user-friendly and efficient. With full installation, VERSA offers 65 metrics with 729 metric variations based on different configurations. These metrics encompass evaluations utilizing diverse external resources, including matching and non-matching reference audio, text transcriptions, and text captions. As a lightweight yet comprehensive toolkit, VERSA is versatile to support the evaluation of a wide range of downstream scenarios. To demonstrate its capabilities, this work highlights example use cases for VERSA, including audio coding, speech synthesis, speech enhancement, singing synthesis, and music generation. The toolkit is available at https://github.com/shinjiwlab/versa.
%R 10.18653/v1/2025.naacl-demo.19
%U https://aclanthology.org/2025.naacl-demo.19/
%U https://doi.org/10.18653/v1/2025.naacl-demo.19
%P 191-209
Markdown (Informal)
[VERSA: A Versatile Evaluation Toolkit for Speech, Audio, and Music](https://aclanthology.org/2025.naacl-demo.19/) (Shi et al., NAACL 2025)
ACL
- Jiatong Shi, Hye-jin Shim, Jinchuan Tian, Siddhant Arora, Haibin Wu, Darius Petermann, Jia Qi Yip, You Zhang, Yuxun Tang, Wangyou Zhang, Dareen Safar Alharthi, Yichen Huang, Koichi Saito, Jionghao Han, Yiwen Zhao, Chris Donahue, and Shinji Watanabe. 2025. VERSA: A Versatile Evaluation Toolkit for Speech, Audio, and Music. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (System Demonstrations), pages 191–209, Albuquerque, New Mexico. Association for Computational Linguistics.