BibTeX
@inproceedings{ma-etal-2024-multi,
title = "Multi-Channel Spatio-Temporal Transformer for Sign Language Production",
author = "Ma, Xiaohan and
Jin, Rize and
Chung, Tae-Sun",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1022",
pages = "11699--11712",
abstract = "The task of Sign Language Production (SLP) in machine learning involves converting text-based spoken language into corresponding sign language expressions. Sign language conveys meaning through the continuous movement of multiple articulators, including manual and non-manual channels. However, most current Transformer-based SLP models convert these multi-channel sign poses into a unified feature representation, ignoring the inherent structural correlations between channels. This paper introduces a novel approach called MCST-Transformer for skeletal sign language production. It employs multi-channel spatial attention to capture correlations across various channels within each frame, and temporal attention to learn sequential dependencies for each channel over time. Additionally, the paper explores and experiments with multiple fusion techniques to combine the spatial and temporal representations into naturalistic sign sequences. To validate the effectiveness of the proposed MCST-Transformer model and its constituent components, extensive experiments were conducted on two benchmark sign language datasets from diverse cultures. The results demonstrate that this new approach outperforms state-of-the-art models on both datasets.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ma-etal-2024-multi">
<titleInfo>
<title>Multi-Channel Spatio-Temporal Transformer for Sign Language Production</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiaohan</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rize</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tae-Sun</namePart>
<namePart type="family">Chung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The task of Sign Language Production (SLP) in machine learning involves converting text-based spoken language into corresponding sign language expressions. Sign language conveys meaning through the continuous movement of multiple articulators, including manual and non-manual channels. However, most current Transformer-based SLP models convert these multi-channel sign poses into a unified feature representation, ignoring the inherent structural correlations between channels. This paper introduces a novel approach called MCST-Transformer for skeletal sign language production. It employs multi-channel spatial attention to capture correlations across various channels within each frame, and temporal attention to learn sequential dependencies for each channel over time. Additionally, the paper explores and experiments with multiple fusion techniques to combine the spatial and temporal representations into naturalistic sign sequences. To validate the effectiveness of the proposed MCST-Transformer model and its constituent components, extensive experiments were conducted on two benchmark sign language datasets from diverse cultures. The results demonstrate that this new approach outperforms state-of-the-art models on both datasets.</abstract>
<identifier type="citekey">ma-etal-2024-multi</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1022</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>11699</start>
<end>11712</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Multi-Channel Spatio-Temporal Transformer for Sign Language Production
%A Ma, Xiaohan
%A Jin, Rize
%A Chung, Tae-Sun
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F ma-etal-2024-multi
%X The task of Sign Language Production (SLP) in machine learning involves converting text-based spoken language into corresponding sign language expressions. Sign language conveys meaning through the continuous movement of multiple articulators, including manual and non-manual channels. However, most current Transformer-based SLP models convert these multi-channel sign poses into a unified feature representation, ignoring the inherent structural correlations between channels. This paper introduces a novel approach called MCST-Transformer for skeletal sign language production. It employs multi-channel spatial attention to capture correlations across various channels within each frame, and temporal attention to learn sequential dependencies for each channel over time. Additionally, the paper explores and experiments with multiple fusion techniques to combine the spatial and temporal representations into naturalistic sign sequences. To validate the effectiveness of the proposed MCST-Transformer model and its constituent components, extensive experiments were conducted on two benchmark sign language datasets from diverse cultures. The results demonstrate that this new approach outperforms state-of-the-art models on both datasets.
%U https://aclanthology.org/2024.lrec-main.1022
%P 11699-11712
Markdown (Informal)
[Multi-Channel Spatio-Temporal Transformer for Sign Language Production](https://aclanthology.org/2024.lrec-main.1022) (Ma et al., LREC-COLING 2024)
ACL
Xiaohan Ma, Rize Jin, and Tae-Sun Chung. 2024. Multi-Channel Spatio-Temporal Transformer for Sign Language Production. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 11699–11712, Torino, Italia. ELRA and ICCL.
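
The abstract describes the paper's core mechanism: spatial attention across articulator channels within each frame, temporal attention over frames within each channel, and a fusion of the two representations. As a minimal, hypothetical sketch of that idea (this is not the authors' code; the channel count, dimensions, `MCSTBlock` name, and the additive fusion chosen here are all assumptions, since the paper reportedly compares several fusion techniques):

```python
# Hypothetical sketch of multi-channel spatio-temporal attention,
# loosely following the abstract; all dimensions and the fusion rule are assumed.
import torch
import torch.nn as nn

class MCSTBlock(nn.Module):
    def __init__(self, dim: int, heads: int = 4):
        super().__init__()
        # Spatial attention: channels attend to one another within a single frame.
        self.spatial = nn.MultiheadAttention(dim, heads, batch_first=True)
        # Temporal attention: each channel attends over its own frame sequence.
        self.temporal = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm_s = nn.LayerNorm(dim)
        self.norm_t = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, channels, dim); channels could be manual and
        # non-manual articulator groups, e.g. hands, face, body.
        b, t, c, d = x.shape
        # Spatial pass: fold time into the batch, attend across channels per frame.
        xs = x.reshape(b * t, c, d)
        s, _ = self.spatial(xs, xs, xs)
        s = self.norm_s(s + xs).reshape(b, t, c, d)
        # Temporal pass: fold channels into the batch, attend across time per channel.
        xt = x.permute(0, 2, 1, 3).reshape(b * c, t, d)
        tt, _ = self.temporal(xt, xt, xt)
        tt = self.norm_t(tt + xt).reshape(b, c, t, d).permute(0, 2, 1, 3)
        # Additive fusion of the spatial and temporal representations
        # (one plausible choice among the fusion strategies the paper explores).
        return s + tt

# Usage example: batch of 2 sequences, 16 frames, 3 channels, 64-dim features.
out = MCSTBlock(64)(torch.randn(2, 16, 3, 64))  # -> (2, 16, 3, 64)
```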