@inproceedings{mo-etal-2026-j,
title = "{J}-Shuwa: A Large-Scale Web-Collected {J}apanese {S}ign {L}anguage-{J}apanese Parallel Corpus",
author = "Mo, Junwen and
Vo, MinhDuc and
Nishida, Noriki and
Satoh, Shin{'}ichi and
Nakayama, Hideki",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1821/",
pages = "36559--36574",
ISBN = "979-8-89176-395-1",
abstract = "Japanese Sign Language (JSL) is a low-resource sign language that has received limited attention in the AI research community, primarily due to the lack of large-scale, publicly available parallel corpora. In this work, we introduce J-Shuwa, a large-scale JSL-Japanese parallel corpus constructed from YouTube videos with hard-coded subtitles and closed captions. The corpus contains 197K parallel JSL-Japanese sentence pairs, totaling approximately 300 hours of video, making it the largest publicly available JSL dataset to date. We conduct sign language translation (SLT) experiments by training models on J-Shuwa and evaluating them on the JSL Dialogue Corpus under both zero-shot and fine-tuned settings. Our results demonstrate that J-Shuwa is effective for training SLT models. Beyond SLT, we believe that J-Shuwa can also serve as a valuable resource for future JSL research across a wide range of tasks. The dataset and code are publicly available at: https://github.com/SpaJune/J-Shuwa."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mo-etal-2026-j">
<titleInfo>
<title>J-Shuwa: A Large-Scale Web-Collected Japanese Sign Language-Japanese Parallel Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junwen</namePart>
<namePart type="family">Mo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">MinhDuc</namePart>
<namePart type="family">Vo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noriki</namePart>
<namePart type="family">Nishida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shin’ichi</namePart>
<namePart type="family">Satoh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hideki</namePart>
<namePart type="family">Nakayama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Japanese Sign Language (JSL) is a low-resource sign language that has received limited attention in the AI research community, primarily due to the lack of large-scale, publicly available parallel corpora. In this work, we introduce J-Shuwa, a large-scale JSL-Japanese parallel corpus constructed from YouTube videos with hard-coded subtitles and closed captions. The corpus contains 197K parallel JSL-Japanese sentence pairs, totaling approximately 300 hours of video, making it the largest publicly available JSL dataset to date. We conduct sign language translation (SLT) experiments by training models on J-Shuwa and evaluating them on the JSL Dialogue Corpus under both zero-shot and fine-tuned settings. Our results demonstrate that J-Shuwa is effective for training SLT models. Beyond SLT, we believe that J-Shuwa can also serve as a valuable resource for future JSL research across a wide range of tasks. The dataset and code are publicly available at: https://github.com/SpaJune/J-Shuwa.</abstract>
<identifier type="citekey">mo-etal-2026-j</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1821/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>36559</start>
<end>36574</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T J-Shuwa: A Large-Scale Web-Collected Japanese Sign Language-Japanese Parallel Corpus
%A Mo, Junwen
%A Vo, MinhDuc
%A Nishida, Noriki
%A Satoh, Shin’ichi
%A Nakayama, Hideki
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F mo-etal-2026-j
%X Japanese Sign Language (JSL) is a low-resource sign language that has received limited attention in the AI research community, primarily due to the lack of large-scale, publicly available parallel corpora. In this work, we introduce J-Shuwa, a large-scale JSL-Japanese parallel corpus constructed from YouTube videos with hard-coded subtitles and closed captions. The corpus contains 197K parallel JSL-Japanese sentence pairs, totaling approximately 300 hours of video, making it the largest publicly available JSL dataset to date. We conduct sign language translation (SLT) experiments by training models on J-Shuwa and evaluating them on the JSL Dialogue Corpus under both zero-shot and fine-tuned settings. Our results demonstrate that J-Shuwa is effective for training SLT models. Beyond SLT, we believe that J-Shuwa can also serve as a valuable resource for future JSL research across a wide range of tasks. The dataset and code are publicly available at: https://github.com/SpaJune/J-Shuwa.
%U https://aclanthology.org/2026.findings-acl.1821/
%P 36559-36574
Markdown (Informal)
[J-Shuwa: A Large-Scale Web-Collected Japanese Sign Language-Japanese Parallel Corpus](https://aclanthology.org/2026.findings-acl.1821/) (Mo et al., Findings 2026)
ACL