@inproceedings{xiong-etal-2025-sa,
title = "{SA}-{DETR}:Span Aware Detection Transformer for Moment Retrieval",
author = "Xiong, Tianheng and
Wei, Wei and
Xu, Kaihe and
Chen, Dangyang",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
      Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.510/",
pages = "7634--7647",
abstract = "Moment Retrieval aims to locate specific video segments related to the given text. Recently, DETR-based methods, originating from Object Detection, have emerged as effective solutions for Moment Retrieval. These approaches focus on multimodal feature fusion and refining Queries composed of span anchor and content embedding. Despite the success, they often overlook the video-text instance related information in Query Initialization and the crucial guidance role of span anchors in Query Refinement, leading to inaccurate predictions. To address this, we propose a novel \textbf{S}pan \textbf{A}ware \textbf{DE}tection \textbf{TR}ansformer (SA-DETR) that leverages the importance of instance related span anchors. To fully leverage the instance related information, we generate span anchors based on video-text pair rather than using learnable parameters, as is common in conventional DETR-based methods, and supervise them with GT labels. To effectively exploit the correspondence between span anchors and video clips, we enhance content embedding guided by textual features and generate Gaussian mask to modulate the interaction between content embedding and fusion features. Furthermore, we explore the feature alignment across various stages and granularities and apply denoise learning to boost the span awareness of the model. Extensive experiments on QVHighlights, Charades-STA, and TACoS demonstrate the effectiveness of our approach."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xiong-etal-2025-sa">
<titleInfo>
<title>SA-DETR: Span Aware Detection Transformer for Moment Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianheng</namePart>
<namePart type="family">Xiong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaihe</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dangyang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Moment Retrieval aims to locate specific video segments related to the given text. Recently, DETR-based methods, originating from Object Detection, have emerged as effective solutions for Moment Retrieval. These approaches focus on multimodal feature fusion and on refining Queries composed of a span anchor and a content embedding. Despite their success, they often overlook the video-text instance-related information in Query Initialization and the crucial guiding role of span anchors in Query Refinement, leading to inaccurate predictions. To address this, we propose a novel Span Aware DEtection TRansformer (SA-DETR) that leverages the importance of instance-related span anchors. To make full use of the instance-related information, we generate span anchors based on the video-text pair rather than using learnable parameters, as is common in conventional DETR-based methods, and supervise them with ground-truth (GT) labels. To effectively exploit the correspondence between span anchors and video clips, we enhance the content embedding guided by textual features and generate a Gaussian mask to modulate the interaction between the content embedding and the fusion features. Furthermore, we explore feature alignment across various stages and granularities and apply denoising learning to boost the span awareness of the model. Extensive experiments on QVHighlights, Charades-STA, and TACoS demonstrate the effectiveness of our approach.</abstract>
<identifier type="citekey">xiong-etal-2025-sa</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.510/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>7634</start>
<end>7647</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SA-DETR: Span Aware Detection Transformer for Moment Retrieval
%A Xiong, Tianheng
%A Wei, Wei
%A Xu, Kaihe
%A Chen, Dangyang
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F xiong-etal-2025-sa
%X Moment Retrieval aims to locate specific video segments related to the given text. Recently, DETR-based methods, originating from Object Detection, have emerged as effective solutions for Moment Retrieval. These approaches focus on multimodal feature fusion and on refining Queries composed of a span anchor and a content embedding. Despite their success, they often overlook the video-text instance-related information in Query Initialization and the crucial guiding role of span anchors in Query Refinement, leading to inaccurate predictions. To address this, we propose a novel Span Aware DEtection TRansformer (SA-DETR) that leverages the importance of instance-related span anchors. To make full use of the instance-related information, we generate span anchors based on the video-text pair rather than using learnable parameters, as is common in conventional DETR-based methods, and supervise them with ground-truth (GT) labels. To effectively exploit the correspondence between span anchors and video clips, we enhance the content embedding guided by textual features and generate a Gaussian mask to modulate the interaction between the content embedding and the fusion features. Furthermore, we explore feature alignment across various stages and granularities and apply denoising learning to boost the span awareness of the model. Extensive experiments on QVHighlights, Charades-STA, and TACoS demonstrate the effectiveness of our approach.
%U https://aclanthology.org/2025.coling-main.510/
%P 7634-7647
Markdown (Informal)
[SA-DETR: Span Aware Detection Transformer for Moment Retrieval](https://aclanthology.org/2025.coling-main.510/) (Xiong et al., COLING 2025)
ACL
Tianheng Xiong, Wei Wei, Kaihe Xu, and Dangyang Chen. 2025. SA-DETR: Span Aware Detection Transformer for Moment Retrieval. In Proceedings of the 31st International Conference on Computational Linguistics, pages 7634–7647, Abu Dhabi, UAE. Association for Computational Linguistics.