@inproceedings{rodriguez-opazo-etal-2023-memory,
    title = "Memory-efficient Temporal Moment Localization in Long Videos",
    author = "Rodriguez-Opazo, Cristian and
      Marrese-Taylor, Edison and
      Fernando, Basura and
      Takamura, Hiroya and
      Wu, Qi",
    editor = "Vlachos, Andreas and
      Augenstein, Isabelle",
    booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
    month = may,
    year = "2023",
    address = "Dubrovnik, Croatia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.eacl-main.140",
    doi = "10.18653/v1/2023.eacl-main.140",
    pages = "1909--1924",
    abstract = "Temporal Moment Localization is a challenging multi-modal task which aims to identify the start and end timestamps of a moment of interest in an input untrimmed video, given a query in natural language. Solving this task correctly requires understanding the temporal relationships in the entire input video, but processing such long inputs and reasoning about them is memory and computationally expensive. In light of this issue, we propose Stochastic Bucket-wise Feature Sampling (SBFS), a stochastic sampling module that allows methods to process long videos at a constant memory footprint. We further combine SBFS with a new consistency loss to propose Locformer, a Transformer-based model that can process videos as long as 18 minutes. We test our proposals on relevant benchmark datasets, showing that not only can Locformer achieve excellent results, but also that our sampling is more effective than competing counterparts. Concretely, SBFS consistently improves the performance of prior work, by up to 3.13{\%} in the mean temporal IoU, leading to a new state-of-the-art performance on Charades-STA and YouCookII, while also obtaining up to 12.8x speed-up at testing time and reducing memory requirements by up to 5x.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rodriguez-opazo-etal-2023-memory">
    <titleInfo>
      <title>Memory-efficient Temporal Moment Localization in Long Videos</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Cristian</namePart>
      <namePart type="family">Rodriguez-Opazo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Edison</namePart>
      <namePart type="family">Marrese-Taylor</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Basura</namePart>
      <namePart type="family">Fernando</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hiroya</namePart>
      <namePart type="family">Takamura</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qi</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Temporal Moment Localization is a challenging multi-modal task which aims to identify the start and end timestamps of a moment of interest in an input untrimmed video, given a query in natural language. Solving this task correctly requires understanding the temporal relationships in the entire input video, but processing such long inputs and reasoning about them is memory and computationally expensive. In light of this issue, we propose Stochastic Bucket-wise Feature Sampling (SBFS), a stochastic sampling module that allows methods to process long videos at a constant memory footprint. We further combine SBFS with a new consistency loss to propose Locformer, a Transformer-based model that can process videos as long as 18 minutes. We test our proposals on relevant benchmark datasets, showing that not only can Locformer achieve excellent results, but also that our sampling is more effective than competing counterparts. Concretely, SBFS consistently improves the performance of prior work, by up to 3.13% in the mean temporal IoU, leading to a new state-of-the-art performance on Charades-STA and YouCookII, while also obtaining up to 12.8x speed-up at testing time and reducing memory requirements by up to 5x.</abstract>
    <identifier type="citekey">rodriguez-opazo-etal-2023-memory</identifier>
    <identifier type="doi">10.18653/v1/2023.eacl-main.140</identifier>
    <location>
      <url>https://aclanthology.org/2023.eacl-main.140</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>1909</start>
        <end>1924</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Memory-efficient Temporal Moment Localization in Long Videos
%A Rodriguez-Opazo, Cristian
%A Marrese-Taylor, Edison
%A Fernando, Basura
%A Takamura, Hiroya
%A Wu, Qi
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F rodriguez-opazo-etal-2023-memory
%X Temporal Moment Localization is a challenging multi-modal task which aims to identify the start and end timestamps of a moment of interest in an input untrimmed video, given a query in natural language. Solving this task correctly requires understanding the temporal relationships in the entire input video, but processing such long inputs and reasoning about them is memory and computationally expensive. In light of this issue, we propose Stochastic Bucket-wise Feature Sampling (SBFS), a stochastic sampling module that allows methods to process long videos at a constant memory footprint. We further combine SBFS with a new consistency loss to propose Locformer, a Transformer-based model that can process videos as long as 18 minutes. We test our proposals on relevant benchmark datasets, showing that not only can Locformer achieve excellent results, but also that our sampling is more effective than competing counterparts. Concretely, SBFS consistently improves the performance of prior work, by up to 3.13% in the mean temporal IoU, leading to a new state-of-the-art performance on Charades-STA and YouCookII, while also obtaining up to 12.8x speed-up at testing time and reducing memory requirements by up to 5x.
%R 10.18653/v1/2023.eacl-main.140
%U https://aclanthology.org/2023.eacl-main.140
%U https://doi.org/10.18653/v1/2023.eacl-main.140
%P 1909-1924
Markdown (Informal)
[Memory-efficient Temporal Moment Localization in Long Videos](https://aclanthology.org/2023.eacl-main.140) (Rodriguez-Opazo et al., EACL 2023)

ACL
Cristian Rodriguez-Opazo, Edison Marrese-Taylor, Basura Fernando, Hiroya Takamura, and Qi Wu. 2023. Memory-efficient Temporal Moment Localization in Long Videos. In Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics, pages 1909–1924, Dubrovnik, Croatia. Association for Computational Linguistics.
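The abstract describes Stochastic Bucket-wise Feature Sampling (SBFS) only at a high level: video features are sampled bucket-wise so that arbitrarily long videos map to a constant-size input, and hence a constant memory footprint. As a purely illustrative sketch of that general idea (not the paper's implementation; the function name, the contiguous bucket layout, and the one-draw-per-bucket rule are all assumptions), bucket-wise sampling might look like this:

```python
import numpy as np

def stochastic_bucket_sample(features, num_buckets, rng=None):
    """Hypothetical sketch of bucket-wise stochastic sampling.

    Partitions the T per-frame features into `num_buckets` contiguous
    buckets along the temporal axis and draws one feature per bucket,
    so the output length is constant regardless of video length.
    """
    rng = rng or np.random.default_rng()
    t = features.shape[0]
    # Contiguous bucket boundaries over the temporal axis.
    edges = np.linspace(0, t, num_buckets + 1, dtype=int)
    # One uniform draw per non-empty bucket (short videos yield fewer picks).
    picks = [int(rng.integers(lo, hi))
             for lo, hi in zip(edges[:-1], edges[1:]) if hi > lo]
    return features[picks]

# Example: 10,000 frame features of dimension 512 reduce to a fixed 256
# steps, so downstream attention cost no longer grows with video length.
feats = np.random.randn(10_000, 512).astype(np.float32)
sampled = stochastic_bucket_sample(feats, num_buckets=256)
print(sampled.shape)  # (256, 512)
```

Because each forward pass sees a different draw, resampling at training time doubles as a form of augmentation, which plausibly motivates the abstract's consistency loss (encouraging agreement between predictions on different samplings of the same video); that reading is, again, an assumption based only on the abstract.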