@inproceedings{bianco-etal-2025-augmenting,
    title = "Augmenting Sign Language Translation Datasets with Large Language Models",
    author = "Bianco, Pedro Alejandro Dal and
      Reinhold, Jean Paul Nunes and
      Quiroga, Facundo Manuel and
      Ronchetti, Franco",
    editor = "Hasanuzzaman, Mohammed and
      Quiroga, Facundo Manuel and
      Modi, Ashutosh and
      Kamila, Sabyasachi and
      Artiaga, Keren and
      Joshi, Abhinav and
      Singh, Sanjeet",
    booktitle = "Proceedings of the Workshop on Sign Language Processing (WSLP)",
    month = dec,
    year = "2025",
    address = "IIT Bombay, Mumbai, India (Co-located with IJCNLP{--}AACL 2025)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.wslp-main.4/",
    pages = "20--26",
    ISBN = "979-8-89176-304-3",
    abstract = "Sign language translation (SLT) is a challenging task due to the scarcity of labeled data and the heavy-tailed distribution of sign language vocabularies. In this paper, we explore a novel data augmentation approach for SLT: using a large language model (LLM) to generate paraphrases of the target language sentences in the training data. We experiment with a Transformer-based SLT model (Signformer) on three datasets spanning German, Greek, and Argentinian Sign Languages. For models trained with augmentation, we adopt a two-stage regime: pre-train on the LLM-augmented corpus and then fine-tune on the original, non-augmented training set. Our augmented training sets, expanded with GPT-4-generated paraphrases, yield mixed results. On a medium-scale German SL corpus (PHOENIX14T), LLM augmentation improves BLEU-4 from 9.56 to 10.33. In contrast, a small-vocabulary Greek SL dataset with a near-perfect baseline (94.38 BLEU) sees a slight drop to 92.22 BLEU, and a complex Argentinian SL corpus with a long-tail vocabulary distribution remains around 1.2 BLEU despite augmentation. We analyze these outcomes in relation to each dataset{'}s complexity and token frequency distribution, finding that LLM-based augmentation is more beneficial when the dataset contains a richer vocabulary and many infrequent tokens. To our knowledge, this work is the first to apply LLM paraphrasing to SLT, and we discuss these results with respect to prior data augmentation efforts in sign language translation."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bianco-etal-2025-augmenting">
    <titleInfo>
        <title>Augmenting Sign Language Translation Datasets with Large Language Models</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Pedro</namePart>
        <namePart type="given">Alejandro</namePart>
        <namePart type="given">Dal</namePart>
        <namePart type="family">Bianco</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Jean</namePart>
        <namePart type="given">Paul</namePart>
        <namePart type="given">Nunes</namePart>
        <namePart type="family">Reinhold</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Facundo</namePart>
        <namePart type="given">Manuel</namePart>
        <namePart type="family">Quiroga</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Franco</namePart>
        <namePart type="family">Ronchetti</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the Workshop on Sign Language Processing (WSLP)</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Mohammed</namePart>
            <namePart type="family">Hasanuzzaman</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Facundo</namePart>
            <namePart type="given">Manuel</namePart>
            <namePart type="family">Quiroga</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Ashutosh</namePart>
            <namePart type="family">Modi</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Sabyasachi</namePart>
            <namePart type="family">Kamila</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Keren</namePart>
            <namePart type="family">Artiaga</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Abhinav</namePart>
            <namePart type="family">Joshi</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Sanjeet</namePart>
            <namePart type="family">Singh</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">IIT Bombay, Mumbai, India (Co-located with IJCNLP–AACL 2025)</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
        <identifier type="isbn">979-8-89176-304-3</identifier>
    </relatedItem>
    <abstract>Sign language translation (SLT) is a challenging task due to the scarcity of labeled data and the heavy-tailed distribution of sign language vocabularies. In this paper, we explore a novel data augmentation approach for SLT: using a large language model (LLM) to generate paraphrases of the target language sentences in the training data. We experiment with a Transformer-based SLT model (Signformer) on three datasets spanning German, Greek, and Argentinian Sign Languages. For models trained with augmentation, we adopt a two-stage regime: pre-train on the LLM-augmented corpus and then fine-tune on the original, non-augmented training set. Our augmented training sets, expanded with GPT-4-generated paraphrases, yield mixed results. On a medium-scale German SL corpus (PHOENIX14T), LLM augmentation improves BLEU-4 from 9.56 to 10.33. In contrast, a small-vocabulary Greek SL dataset with a near-perfect baseline (94.38 BLEU) sees a slight drop to 92.22 BLEU, and a complex Argentinian SL corpus with a long-tail vocabulary distribution remains around 1.2 BLEU despite augmentation. We analyze these outcomes in relation to each dataset’s complexity and token frequency distribution, finding that LLM-based augmentation is more beneficial when the dataset contains a richer vocabulary and many infrequent tokens. To our knowledge, this work is the first to apply LLM paraphrasing to SLT, and we discuss these results with respect to prior data augmentation efforts in sign language translation.</abstract>
    <identifier type="citekey">bianco-etal-2025-augmenting</identifier>
    <location>
        <url>https://aclanthology.org/2025.wslp-main.4/</url>
    </location>
    <part>
        <date>2025-12</date>
        <extent unit="page">
            <start>20</start>
            <end>26</end>
        </extent>
    </part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Augmenting Sign Language Translation Datasets with Large Language Models
%A Bianco, Pedro Alejandro Dal
%A Reinhold, Jean Paul Nunes
%A Quiroga, Facundo Manuel
%A Ronchetti, Franco
%Y Hasanuzzaman, Mohammed
%Y Quiroga, Facundo Manuel
%Y Modi, Ashutosh
%Y Kamila, Sabyasachi
%Y Artiaga, Keren
%Y Joshi, Abhinav
%Y Singh, Sanjeet
%S Proceedings of the Workshop on Sign Language Processing (WSLP)
%D 2025
%8 December
%I Association for Computational Linguistics
%C IIT Bombay, Mumbai, India (Co-located with IJCNLP–AACL 2025)
%@ 979-8-89176-304-3
%F bianco-etal-2025-augmenting
%X Sign language translation (SLT) is a challenging task due to the scarcity of labeled data and the heavy-tailed distribution of sign language vocabularies. In this paper, we explore a novel data augmentation approach for SLT: using a large language model (LLM) to generate paraphrases of the target language sentences in the training data. We experiment with a Transformer-based SLT model (Signformer) on three datasets spanning German, Greek, and Argentinian Sign Languages. For models trained with augmentation, we adopt a two-stage regime: pre-train on the LLM-augmented corpus and then fine-tune on the original, non-augmented training set. Our augmented training sets, expanded with GPT-4-generated paraphrases, yield mixed results. On a medium-scale German SL corpus (PHOENIX14T), LLM augmentation improves BLEU-4 from 9.56 to 10.33. In contrast, a small-vocabulary Greek SL dataset with a near-perfect baseline (94.38 BLEU) sees a slight drop to 92.22 BLEU, and a complex Argentinian SL corpus with a long-tail vocabulary distribution remains around 1.2 BLEU despite augmentation. We analyze these outcomes in relation to each dataset’s complexity and token frequency distribution, finding that LLM-based augmentation is more beneficial when the dataset contains a richer vocabulary and many infrequent tokens. To our knowledge, this work is the first to apply LLM paraphrasing to SLT, and we discuss these results with respect to prior data augmentation efforts in sign language translation.
%U https://aclanthology.org/2025.wslp-main.4/
%P 20-26

Markdown (Informal)

[Augmenting Sign Language Translation Datasets with Large Language Models](https://aclanthology.org/2025.wslp-main.4/) (Bianco et al., WSLP 2025)

ACL

Pedro Alejandro Dal Bianco, Jean Paul Nunes Reinhold, Facundo Manuel Quiroga, and Franco Ronchetti. 2025. Augmenting Sign Language Translation Datasets with Large Language Models. In Proceedings of the Workshop on Sign Language Processing (WSLP), pages 20–26, IIT Bombay, Mumbai, India (Co-located with IJCNLP–AACL 2025). Association for Computational Linguistics.
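
The abstract describes the augmentation recipe concretely: each training pair couples a sign-video clip with a target-language sentence, GPT-4 paraphrases the sentence side only, and the SLT model is pre-trained on the expanded corpus before fine-tuning on the original data. The sketch below illustrates that pipeline under stated assumptions; it is not the authors' code. The helper names (`paraphrase_sentence`, `augment`), the prompt, the paraphrase count, the `(video_path, sentence)` dataset representation, and the use of the OpenAI Python client are all assumptions for illustration.

```python
# Minimal sketch (not the authors' code) of LLM paraphrase augmentation for SLT,
# as described in the paper's abstract: expand each (sign video, target sentence)
# pair with GPT-4 paraphrases of the sentence, leaving the video side unchanged.
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment


def paraphrase_sentence(sentence: str, n: int = 3) -> list[str]:
    """Request n paraphrases of one target-language sentence (prompt is an assumption)."""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{
            "role": "user",
            "content": (
                f"Rewrite the following sentence in {n} different ways, "
                f"one per line, preserving its meaning:\n{sentence}"
            ),
        }],
    )
    lines = response.choices[0].message.content.strip().splitlines()
    return [line.strip() for line in lines if line.strip()][:n]


def augment(dataset: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Return the original (video_path, sentence) pairs plus paraphrased copies."""
    augmented = list(dataset)  # keep every original pair
    for video_path, sentence in dataset:
        for paraphrase in paraphrase_sentence(sentence):
            # Same clip, new text: only the target side is augmented.
            augmented.append((video_path, paraphrase))
    return augmented


# Two-stage regime from the abstract (the SLT training loop itself is omitted):
#   1. pre-train the Transformer-based SLT model (Signformer) on augment(train_pairs)
#   2. fine-tune on the original, non-augmented train_pairs
```

Note that, as the abstract states, only the target sentences are paraphrased, so each video clip is simply reused with new text; no new sign videos are synthesized.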