@inproceedings{johnny-etal-2025-pose,
title = "Pose-Based Sign Language Spotting via an End-to-End Encoder Architecture",
author = "Johnny, Samuel Ebimobowei and
Guda, Blessed and
Aaron, Emmanuel and
Gueye, Assane",
editor = "Hasanuzzaman, Mohammed and
Quiroga, Facundo Manuel and
Modi, Ashutosh and
Kamila, Sabyasachi and
Artiaga, Keren and
Joshi, Abhinav and
Singh, Sanjeet",
booktitle = "Proceedings of the Workshop on Sign Language Processing (WSLP)",
month = dec,
year = "2025",
address = "IIT Bombay, Mumbai, India (Co-located with IJCNLP{--}AACL 2025)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wslp-main.10/",
pages = "67--72",
ISBN = "979-8-89176-304-3",
abstract = "Automatic Sign Language Recognition (ASLR) has emerged as a vital field for bridging the gap between deaf and hearing communities. However, the problem of sign-to-sign retrieval or detecting a specific sign within a sequence of continuous signs remains largely unexplored. We define this novel task as Sign Language Spotting. In this paper, we present a first step toward sign language retrieval by addressing the challenge of detecting the presence or absence of a query sign video within a sentence-level gloss or sign video. Unlike conventional approaches that rely on intermediate gloss recognition or text-based matching, we propose an end-to-end model that directly operates on pose keypoints extracted from sign videos. Our architecture employs an encoder-only backbone with a binary classification head to determine whether the query sign appears within the target sequence. By focusing on pose representations instead of raw RGB frames, our method significantly reduces computational cost and mitigates visual noise. We evaluate our approach on the Word Presence Prediction dataset from the WSLP 2025 shared task, achieving 61.88{\%} accuracy and 60.00{\%} F1-score. These results demonstrate the effectiveness of our pose-based framework for Sign Language Spotting, establishing a strong foundation for future research in automatic sign language retrieval and verification."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="johnny-etal-2025-pose">
<titleInfo>
<title>Pose-Based Sign Language Spotting via an End-to-End Encoder Architecture</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="given">Ebimobowei</namePart>
<namePart type="family">Johnny</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Blessed</namePart>
<namePart type="family">Guda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmanuel</namePart>
<namePart type="family">Aaron</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Assane</namePart>
<namePart type="family">Gueye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Sign Language Processing (WSLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohammed</namePart>
<namePart type="family">Hasanuzzaman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Facundo</namePart>
<namePart type="given">Manuel</namePart>
<namePart type="family">Quiroga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashutosh</namePart>
<namePart type="family">Modi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sabyasachi</namePart>
<namePart type="family">Kamila</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keren</namePart>
<namePart type="family">Artiaga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhinav</namePart>
<namePart type="family">Joshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanjeet</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">IIT Bombay, Mumbai, India (Co-located with IJCNLP–AACL 2025)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-304-3</identifier>
</relatedItem>
<abstract>Automatic Sign Language Recognition (ASLR) has emerged as a vital field for bridging the gap between deaf and hearing communities. However, the problem of sign-to-sign retrieval or detecting a specific sign within a sequence of continuous signs remains largely unexplored. We define this novel task as Sign Language Spotting. In this paper, we present a first step toward sign language retrieval by addressing the challenge of detecting the presence or absence of a query sign video within a sentence-level gloss or sign video. Unlike conventional approaches that rely on intermediate gloss recognition or text-based matching, we propose an end-to-end model that directly operates on pose keypoints extracted from sign videos. Our architecture employs an encoder-only backbone with a binary classification head to determine whether the query sign appears within the target sequence. By focusing on pose representations instead of raw RGB frames, our method significantly reduces computational cost and mitigates visual noise. We evaluate our approach on the Word Presence Prediction dataset from the WSLP 2025 shared task, achieving 61.88% accuracy and 60.00% F1-score. These results demonstrate the effectiveness of our pose-based framework for Sign Language Spotting, establishing a strong foundation for future research in automatic sign language retrieval and verification.</abstract>
<identifier type="citekey">johnny-etal-2025-pose</identifier>
<location>
<url>https://aclanthology.org/2025.wslp-main.10/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>67</start>
<end>72</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Pose-Based Sign Language Spotting via an End-to-End Encoder Architecture
%A Johnny, Samuel Ebimobowei
%A Guda, Blessed
%A Aaron, Emmanuel
%A Gueye, Assane
%Y Hasanuzzaman, Mohammed
%Y Quiroga, Facundo Manuel
%Y Modi, Ashutosh
%Y Kamila, Sabyasachi
%Y Artiaga, Keren
%Y Joshi, Abhinav
%Y Singh, Sanjeet
%S Proceedings of the Workshop on Sign Language Processing (WSLP)
%D 2025
%8 December
%I Association for Computational Linguistics
%C IIT Bombay, Mumbai, India (Co-located with IJCNLP–AACL 2025)
%@ 979-8-89176-304-3
%F johnny-etal-2025-pose
%X Automatic Sign Language Recognition (ASLR) has emerged as a vital field for bridging the gap between deaf and hearing communities. However, the problem of sign-to-sign retrieval or detecting a specific sign within a sequence of continuous signs remains largely unexplored. We define this novel task as Sign Language Spotting. In this paper, we present a first step toward sign language retrieval by addressing the challenge of detecting the presence or absence of a query sign video within a sentence-level gloss or sign video. Unlike conventional approaches that rely on intermediate gloss recognition or text-based matching, we propose an end-to-end model that directly operates on pose keypoints extracted from sign videos. Our architecture employs an encoder-only backbone with a binary classification head to determine whether the query sign appears within the target sequence. By focusing on pose representations instead of raw RGB frames, our method significantly reduces computational cost and mitigates visual noise. We evaluate our approach on the Word Presence Prediction dataset from the WSLP 2025 shared task, achieving 61.88% accuracy and 60.00% F1-score. These results demonstrate the effectiveness of our pose-based framework for Sign Language Spotting, establishing a strong foundation for future research in automatic sign language retrieval and verification.
%U https://aclanthology.org/2025.wslp-main.10/
%P 67-72
Markdown (Informal)
[Pose-Based Sign Language Spotting via an End-to-End Encoder Architecture](https://aclanthology.org/2025.wslp-main.10/) (Johnny et al., WSLP 2025)
ACL
Samuel Ebimobowei Johnny, Blessed Guda, Emmanuel Aaron, and Assane Gueye. 2025. Pose-Based Sign Language Spotting via an End-to-End Encoder Architecture. In Proceedings of the Workshop on Sign Language Processing (WSLP), pages 67–72, IIT Bombay, Mumbai, India (Co-located with IJCNLP–AACL 2025). Association for Computational Linguistics.