@comment{
  Conference-paper record for A-TASC (Zhou and Yoshinaga, 2025); see the url
  and doi fields for the canonical landing pages.
  Title braces protect whole words (the corpus name, the proper adjective and
  the TED acronym) so sentence-casing styles keep their capitals.
  NOTE(review): address holds the string Vienna, Austria, which looks like the
  conference venue rather than the publisher's city. Kept as exported; confirm
  against the target bibliography style.
}
@inproceedings{zhou-yoshinaga-2025-tasc,
  author    = {Zhou, Yuhan and
               Yoshinaga, Naoki},
  title     = {{A-TASC}: {Asian} {TED}-Based Automatic Subtitling Corpus},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  pages     = {3135--3148},
  isbn      = {979-8-89176-251-0},
  doi       = {10.18653/v1/2025.acl-long.157},
  url       = {https://aclanthology.org/2025.acl-long.157/},
  abstract  = {Subtitles play a crucial role in improving the accessibility of the vast amount of audiovisual content available on the Internet, allowing audiences worldwide to comprehend and engage with this content in various languages. Automatic subtitling (AS) systems are essential for alleviating the substantial workload of human transcribers and translators. However, existing AS corpora and the primary metric SubER focus on European languages. This paper introduces A-TASC, an Asian TED-based automatic subtitling corpus derived from English TED Talks, comprising nearly 800 hours of audio segments, aligned English transcripts, and subtitles in Chinese, Japanese, Korean, and Vietnamese. We then present SacreSubER, a modification of SubER, to enable the reliable evaluation of subtitle quality for languages without explicit word boundaries. Experimental results, using both end-to-end systems and pipeline approaches built on strong ASR and LLM components, validate the quality of the proposed corpus and reveal differences in AS performance between European and Asian languages. The code to build our corpus is released.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="zhou-yoshinaga-2025-tasc">
    <titleInfo>
      <title>A-TASC: Asian TED-Based Automatic Subtitling Corpus</title>
    </titleInfo>
    <!-- Authors of the paper, in citation order. -->
    <name type="personal">
      <namePart type="given">Yuhan</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Naoki</namePart>
      <namePart type="family">Yoshinaga</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume containing this paper, with its editors. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-251-0</identifier>
    </relatedItem>
    <abstract>Subtitles play a crucial role in improving the accessibility of the vast amount of audiovisual content available on the Internet, allowing audiences worldwide to comprehend and engage with this content in various languages. Automatic subtitling (AS) systems are essential for alleviating the substantial workload of human transcribers and translators. However, existing AS corpora and the primary metric SubER focus on European languages. This paper introduces A-TASC, an Asian TED-based automatic subtitling corpus derived from English TED Talks, comprising nearly 800 hours of audio segments, aligned English transcripts, and subtitles in Chinese, Japanese, Korean, and Vietnamese. We then present SacreSubER, a modification of SubER, to enable the reliable evaluation of subtitle quality for languages without explicit word boundaries. Experimental results, using both end-to-end systems and pipeline approaches built on strong ASR and LLM components, validate the quality of the proposed corpus and reveal differences in AS performance between European and Asian languages. The code to build our corpus is released.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">zhou-yoshinaga-2025-tasc</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-long.157</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-long.157/</url>
    </location>
    <!-- Page extent of the paper. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>3135</start>
        <end>3148</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A-TASC: Asian TED-Based Automatic Subtitling Corpus
%A Zhou, Yuhan
%A Yoshinaga, Naoki
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F zhou-yoshinaga-2025-tasc
%X Subtitles play a crucial role in improving the accessibility of the vast amount of audiovisual content available on the Internet, allowing audiences worldwide to comprehend and engage with this content in various languages. Automatic subtitling (AS) systems are essential for alleviating the substantial workload of human transcribers and translators. However, existing AS corpora and the primary metric SubER focus on European languages. This paper introduces A-TASC, an Asian TED-based automatic subtitling corpus derived from English TED Talks, comprising nearly 800 hours of audio segments, aligned English transcripts, and subtitles in Chinese, Japanese, Korean, and Vietnamese. We then present SacreSubER, a modification of SubER, to enable the reliable evaluation of subtitle quality for languages without explicit word boundaries. Experimental results, using both end-to-end systems and pipeline approaches built on strong ASR and LLM components, validate the quality of the proposed corpus and reveal differences in AS performance between European and Asian languages. The code to build our corpus is released.
%R 10.18653/v1/2025.acl-long.157
%U https://aclanthology.org/2025.acl-long.157/
%U https://doi.org/10.18653/v1/2025.acl-long.157
%P 3135-3148
Markdown (Informal)
[A-TASC: Asian TED-Based Automatic Subtitling Corpus](https://aclanthology.org/2025.acl-long.157/) (Zhou & Yoshinaga, ACL 2025)
ACL
- Yuhan Zhou and Naoki Yoshinaga. 2025. A-TASC: Asian TED-Based Automatic Subtitling Corpus. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 3135–3148, Vienna, Austria. Association for Computational Linguistics.