@inproceedings{fang-etal-2026-stable,
title = "Stable Signer: Hierarchical Sign Language Generative Model",
author = "Fang, Sen and
Feng, Yalin and
Zhong, Hongbin and
Zhang, Yanxin and
Metaxas, Dimitris N.",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.659/",
pages = "14472--14483",
ISBN = "979-8-89176-390-6",
abstract = "Sign Language Production (SLP) is the process of converting the complex input text into a real video. Most previous works focused on the Text2Gloss, Gloss2Pose, Pose2Vid stages, and some concentrated on Prompt2Gloss and Text2Avatar stages. However, this field has made slow progress due to the inaccuracy of text conversion, pose generation, and the rendering of poses into real human videos in these stages, resulting in gradually accumulating errors. Therefore, in this paper, we streamline the traditional redundant structure, simplify and optimize the task objective, and design a new sign language generative model called **Stable Signer**. It redefines the SLP task as a hierarchical generation end-to-end task that only includes text understanding (Prompt2Gloss, Text2Gloss) and Pose2Vid, and executes text understanding through our proposed new **S**ign **L**anguage **U**nderstanding **L**inker called **SLUL**, and generates hand gestures through the named **SLP-MoE** hand gesture rendering expert block to end-to-end generate high-quality and multi-style sign language videos. SLUL is trained using the newly developed **S**emantic-**A**ware **G**loss **M**asking Loss (**SAGM Loss**). Its performance has improved by 48.6{\%} compared to the current SOTA generation methods, which is a significant increase in the SLP field. More demo can be obtained at [anonymous url](https://anonymoussubmissionurl.github.io/Stable-Signer)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fang-etal-2026-stable">
<titleInfo>
<title>Stable Signer: Hierarchical Sign Language Generative Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sen</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yalin</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongbin</namePart>
<namePart type="family">Zhong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanxin</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dimitris</namePart>
<namePart type="given">N</namePart>
<namePart type="family">Metaxas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Sign Language Production (SLP) is the process of converting the complex input text into a real video. Most previous works focused on the Text2Gloss, Gloss2Pose, Pose2Vid stages, and some concentrated on Prompt2Gloss and Text2Avatar stages. However, this field has made slow progress due to the inaccuracy of text conversion, pose generation, and the rendering of poses into real human videos in these stages, resulting in gradually accumulating errors. Therefore, in this paper, we streamline the traditional redundant structure, simplify and optimize the task objective, and design a new sign language generative model called **Stable Signer**. It redefines the SLP task as a hierarchical generation end-to-end task that only includes text understanding (Prompt2Gloss, Text2Gloss) and Pose2Vid, and executes text understanding through our proposed new **S**ign **L**anguage **U**nderstanding **L**inker called **SLUL**, and generates hand gestures through the named **SLP-MoE** hand gesture rendering expert block to end-to-end generate high-quality and multi-style sign language videos. SLUL is trained using the newly developed **S**emantic-**A**ware **G**loss **M**asking Loss (**SAGM Loss**). Its performance has improved by 48.6% compared to the current SOTA generation methods, which is a significant increase in the SLP field. More demo can be obtained at [anonymous url](https://anonymoussubmissionurl.github.io/Stable-Signer).</abstract>
<identifier type="citekey">fang-etal-2026-stable</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.659/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>14472</start>
<end>14483</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Stable Signer: Hierarchical Sign Language Generative Model
%A Fang, Sen
%A Feng, Yalin
%A Zhong, Hongbin
%A Zhang, Yanxin
%A Metaxas, Dimitris N.
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F fang-etal-2026-stable
%X Sign Language Production (SLP) is the process of converting the complex input text into a real video. Most previous works focused on the Text2Gloss, Gloss2Pose, Pose2Vid stages, and some concentrated on Prompt2Gloss and Text2Avatar stages. However, this field has made slow progress due to the inaccuracy of text conversion, pose generation, and the rendering of poses into real human videos in these stages, resulting in gradually accumulating errors. Therefore, in this paper, we streamline the traditional redundant structure, simplify and optimize the task objective, and design a new sign language generative model called **Stable Signer**. It redefines the SLP task as a hierarchical generation end-to-end task that only includes text understanding (Prompt2Gloss, Text2Gloss) and Pose2Vid, and executes text understanding through our proposed new **S**ign **L**anguage **U**nderstanding **L**inker called **SLUL**, and generates hand gestures through the named **SLP-MoE** hand gesture rendering expert block to end-to-end generate high-quality and multi-style sign language videos. SLUL is trained using the newly developed **S**emantic-**A**ware **G**loss **M**asking Loss (**SAGM Loss**). Its performance has improved by 48.6% compared to the current SOTA generation methods, which is a significant increase in the SLP field. More demo can be obtained at [anonymous url](https://anonymoussubmissionurl.github.io/Stable-Signer).
%U https://aclanthology.org/2026.acl-long.659/
%P 14472-14483
Markdown (Informal)
[Stable Signer: Hierarchical Sign Language Generative Model](https://aclanthology.org/2026.acl-long.659/) (Fang et al., ACL 2026)
ACL
- Sen Fang, Yalin Feng, Hongbin Zhong, Yanxin Zhang, and Dimitris N. Metaxas. 2026. Stable Signer: Hierarchical Sign Language Generative Model. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 14472–14483, San Diego, California, United States. Association for Computational Linguistics.