@inproceedings{zhou-etal-2026-childtalk,
title = "{C}hild{T}alk: A Multi-Dialect {C}hinese Child Speech Corpus with Full-Length Child{--}Caregiver Conversations for Speech Recognition",
author = "Zhou, Jiaming and
Guo, Yujie and
Zhao, Shiwan and
Lu, Yao and
Wang, Jianye and
Sun, Haoqin and
Wang, Hui and
Qin, Yong",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.251/",
pages = "5103--5116",
ISBN = "979-8-89176-395-1",
abstract = "Automatic speech recognition (ASR) for children remains challenging due to developmental variability and the scarcity of high-quality corpora, especially for Mandarin and its dialects. In this paper, we present ChildTalk, a large-scale Chinese child speech corpus designed to address this gap. It contains 112.5 hours of speech from 498 children (aged 2{--}8) and 500 caregivers, recorded as natural child{--}caregiver conversations. Unlike prior Mandarin child ASR corpora that mainly release isolated utterances, ChildTalk provides full-length dialogues with complete transcriptions, preserving turn-taking and discourse context. To our knowledge, it is the first publicly available Mandarin child speech corpus with full-length dialogues and systematic coverage of standard Mandarin, eight Mandarin dialect subgroups, and two additional dialects (Southern Min and Jin). We benchmark end-to-end models trained from scratch, large pre-trained ASR models fine-tuned on ChildTalk, omni-modal LLMs in a zero-shot setting, and commercial speech transcription APIs. Fine-tuning on ChildTalk consistently improves both in-domain and cross-domain performance. These results indicate that ChildTalk provides a challenging, broad-coverage testbed for Chinese child ASR, dialect robustness, and dialogue-level modeling. The dataset will be made freely available for all academic purposes."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhou-etal-2026-childtalk">
<titleInfo>
<title>ChildTalk: A Multi-Dialect Chinese Child Speech Corpus with Full-Length Child–Caregiver Conversations for Speech Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiaming</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yujie</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiwan</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yao</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianye</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoqin</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Automatic speech recognition (ASR) for children remains challenging due to developmental variability and the scarcity of high-quality corpora, especially for Mandarin and its dialects. In this paper, we present ChildTalk, a large-scale Chinese child speech corpus designed to address this gap. It contains 112.5 hours of speech from 498 children (aged 2–8) and 500 caregivers, recorded as natural child–caregiver conversations. Unlike prior Mandarin child ASR corpora that mainly release isolated utterances, ChildTalk provides full-length dialogues with complete transcriptions, preserving turn-taking and discourse context. To our knowledge, it is the first publicly available Mandarin child speech corpus with full-length dialogues and systematic coverage of standard Mandarin, eight Mandarin dialect subgroups, and two additional dialects (Southern Min and Jin). We benchmark end-to-end models trained from scratch, large pre-trained ASR models fine-tuned on ChildTalk, omni-modal LLMs in a zero-shot setting, and commercial speech transcription APIs. Fine-tuning on ChildTalk consistently improves both in-domain and cross-domain performance. These results indicate that ChildTalk provides a challenging, broad-coverage testbed for Chinese child ASR, dialect robustness, and dialogue-level modeling. The dataset will be made freely available for all academic purposes.</abstract>
<identifier type="citekey">zhou-etal-2026-childtalk</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.251/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>5103</start>
<end>5116</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ChildTalk: A Multi-Dialect Chinese Child Speech Corpus with Full-Length Child–Caregiver Conversations for Speech Recognition
%A Zhou, Jiaming
%A Guo, Yujie
%A Zhao, Shiwan
%A Lu, Yao
%A Wang, Jianye
%A Sun, Haoqin
%A Wang, Hui
%A Qin, Yong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhou-etal-2026-childtalk
%X Automatic speech recognition (ASR) for children remains challenging due to developmental variability and the scarcity of high-quality corpora, especially for Mandarin and its dialects. In this paper, we present ChildTalk, a large-scale Chinese child speech corpus designed to address this gap. It contains 112.5 hours of speech from 498 children (aged 2–8) and 500 caregivers, recorded as natural child–caregiver conversations. Unlike prior Mandarin child ASR corpora that mainly release isolated utterances, ChildTalk provides full-length dialogues with complete transcriptions, preserving turn-taking and discourse context. To our knowledge, it is the first publicly available Mandarin child speech corpus with full-length dialogues and systematic coverage of standard Mandarin, eight Mandarin dialect subgroups, and two additional dialects (Southern Min and Jin). We benchmark end-to-end models trained from scratch, large pre-trained ASR models fine-tuned on ChildTalk, omni-modal LLMs in a zero-shot setting, and commercial speech transcription APIs. Fine-tuning on ChildTalk consistently improves both in-domain and cross-domain performance. These results indicate that ChildTalk provides a challenging, broad-coverage testbed for Chinese child ASR, dialect robustness, and dialogue-level modeling. The dataset will be made freely available for all academic purposes.
%U https://aclanthology.org/2026.findings-acl.251/
%P 5103-5116
Markdown (Informal)
[ChildTalk: A Multi-Dialect Chinese Child Speech Corpus with Full-Length Child–Caregiver Conversations for Speech Recognition](https://aclanthology.org/2026.findings-acl.251/) (Zhou et al., Findings 2026)
ACL
- Jiaming Zhou, Yujie Guo, Shiwan Zhao, Yao Lu, Jianye Wang, Haoqin Sun, Hui Wang, and Yong Qin. 2026. ChildTalk: A Multi-Dialect Chinese Child Speech Corpus with Full-Length Child–Caregiver Conversations for Speech Recognition. In Findings of the Association for Computational Linguistics: ACL 2026, pages 5103–5116, San Diego, California, United States. Association for Computational Linguistics.