@inproceedings{du-etal-2026-mcga,
title = "{MCGA}: A Multi-task Classical {C}hinese Literary Genre Audio Corpus",
author = "Du, Yexing and
Liu, Kaiyuan and
Zhang, Bihe and
Pan, Youcheng and
Yang, Bo and
Huo, Liangyu and
Zhang, Xiyuan and
Xie, Jian and
He, Daojing and
Xiang, Yang and
Liu, Ming and
Qin, Bing",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1543/",
pages = "30857--30866",
ISBN = "979-8-89176-395-1",
abstract = "With the rapid advancement of Multimodal Large Language Models (MLLMs), their potential has gained significant attention in Chinese Classical Studies (CCS). While existing research primarily focuses on text and visual modalities, the audio corpus within this domain remains largely underexplored. To bridge this gap, we introduce the Multi-task Classical Chinese Literary Genre Audio Corpus (MCGA), a 119-hour corpus comprising 22,000 audio samples. It encompasses a diverse range of literary genres across six tasks: Automatic Speech Recognition (ASR), Speech-to-Text Translation (S2TT), Speech Emotion Captioning (SEC), Spoken Question Answering (SQA), Speech Understanding (SU), and Speech Reasoning (SR). Through the evaluation of ten MLLMs, our experimental results demonstrate that current MLLMs still face substantial challenges on the MCGA test set. Furthermore, we introduce a domain-specific metric for SEC and a metric to measure the consistency between speech and text capabilities. We release MCGA to the public to facilitate the development of more robust MLLMs. MCGA Corpus: https://github.com/yxduir/MCGA"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="du-etal-2026-mcga">
<titleInfo>
<title>MCGA: A Multi-task Classical Chinese Literary Genre Audio Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yexing</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaiyuan</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bihe</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youcheng</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liangyu</namePart>
<namePart type="family">Huo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiyuan</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daojing</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bing</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>With the rapid advancement of Multimodal Large Language Models (MLLMs), their potential has gained significant attention in Chinese Classical Studies (CCS). While existing research primarily focuses on text and visual modalities, the audio corpus within this domain remains largely underexplored. To bridge this gap, we introduce the Multi-task Classical Chinese Literary Genre Audio Corpus (MCGA), a 119-hour corpus comprising 22,000 audio samples. It encompasses a diverse range of literary genres across six tasks: Automatic Speech Recognition (ASR), Speech-to-Text Translation (S2TT), Speech Emotion Captioning (SEC), Spoken Question Answering (SQA), Speech Understanding (SU), and Speech Reasoning (SR). Through the evaluation of ten MLLMs, our experimental results demonstrate that current MLLMs still face substantial challenges on the MCGA test set. Furthermore, we introduce a domain-specific metric for SEC and a metric to measure the consistency between speech and text capabilities. We release MCGA to the public to facilitate the development of more robust MLLMs. MCGA Corpus: https://github.com/yxduir/MCGA</abstract>
<identifier type="citekey">du-etal-2026-mcga</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1543/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>30857</start>
<end>30866</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MCGA: A Multi-task Classical Chinese Literary Genre Audio Corpus
%A Du, Yexing
%A Liu, Kaiyuan
%A Zhang, Bihe
%A Pan, Youcheng
%A Yang, Bo
%A Huo, Liangyu
%A Zhang, Xiyuan
%A Xie, Jian
%A He, Daojing
%A Xiang, Yang
%A Liu, Ming
%A Qin, Bing
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F du-etal-2026-mcga
%X With the rapid advancement of Multimodal Large Language Models (MLLMs), their potential has gained significant attention in Chinese Classical Studies (CCS). While existing research primarily focuses on text and visual modalities, the audio corpus within this domain remains largely underexplored. To bridge this gap, we introduce the Multi-task Classical Chinese Literary Genre Audio Corpus (MCGA), a 119-hour corpus comprising 22,000 audio samples. It encompasses a diverse range of literary genres across six tasks: Automatic Speech Recognition (ASR), Speech-to-Text Translation (S2TT), Speech Emotion Captioning (SEC), Spoken Question Answering (SQA), Speech Understanding (SU), and Speech Reasoning (SR). Through the evaluation of ten MLLMs, our experimental results demonstrate that current MLLMs still face substantial challenges on the MCGA test set. Furthermore, we introduce a domain-specific metric for SEC and a metric to measure the consistency between speech and text capabilities. We release MCGA to the public to facilitate the development of more robust MLLMs. MCGA Corpus: https://github.com/yxduir/MCGA
%U https://aclanthology.org/2026.findings-acl.1543/
%P 30857-30866
Markdown (Informal)
[MCGA: A Multi-task Classical Chinese Literary Genre Audio Corpus](https://aclanthology.org/2026.findings-acl.1543/) (Du et al., Findings 2026)
ACL
- Yexing Du, Kaiyuan Liu, Bihe Zhang, Youcheng Pan, Bo Yang, Liangyu Huo, Xiyuan Zhang, Jian Xie, Daojing He, Yang Xiang, Ming Liu, and Bing Qin. 2026. MCGA: A Multi-task Classical Chinese Literary Genre Audio Corpus. In Findings of the Association for Computational Linguistics: ACL 2026, pages 30857–30866, San Diego, California, United States. Association for Computational Linguistics.