@inproceedings{li-etal-2025-mitigating,
title = "Mitigating the Discrepancy Between Video and Text Temporal Sequences: A Time-Perception Enhanced Video Grounding method for {LLM}",
author = "Li, Xuefen and
Wang, Bo and
Shi, Ge and
Feng, Chong and
Teng, Jiahao",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.655/",
pages = "9804--9813",
abstract = "Existing video LLMs typically excel at capturing the overall description of a video but lack the ability to demonstrate an understanding of temporal dynamics and a fine-grained grasp of localized content within the video. In this paper, we propose a Time-Perception Enhanced Video Grounding via Boundary Perception and Temporal Reasoning aimed at mitigating LLMs' difficulties in understanding the discrepancies between video and text temporality. Specifically, to address the inherent biases in current datasets, we design a series of boundary-perception tasks to enable LLMs to capture accurate video temporality. To tackle LLMs' insufficient understanding of temporal information, we develop specialized tasks for boundary perception and temporal relationship reasoning to deepen LLMs' perception of video temporality. Our experimental results show significant improvements across three datasets: ActivityNet, Charades, and DiDeMo (achieving up to 11.2{\%} improvement on R@0.3), demonstrating the effectiveness of our proposed temporal awareness-enhanced data construction method."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2025-mitigating">
<titleInfo>
<title>Mitigating the Discrepancy Between Video and Text Temporal Sequences: A Time-Perception Enhanced Video Grounding method for LLM</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuefen</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ge</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chong</namePart>
<namePart type="family">Feng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahao</namePart>
<namePart type="family">Teng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Existing video LLMs typically excel at capturing the overall description of a video but lack the ability to demonstrate an understanding of temporal dynamics and a fine-grained grasp of localized content within the video. In this paper, we propose a Time-Perception Enhanced Video Grounding via Boundary Perception and Temporal Reasoning aimed at mitigating LLMs’ difficulties in understanding the discrepancies between video and text temporality. Specifically, to address the inherent biases in current datasets, we design a series of boundary-perception tasks to enable LLMs to capture accurate video temporality. To tackle LLMs’ insufficient understanding of temporal information, we develop specialized tasks for boundary perception and temporal relationship reasoning to deepen LLMs’ perception of video temporality. Our experimental results show significant improvements across three datasets: ActivityNet, Charades, and DiDeMo (achieving up to 11.2% improvement on R@0.3), demonstrating the effectiveness of our proposed temporal awareness-enhanced data construction method.</abstract>
<identifier type="citekey">li-etal-2025-mitigating</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.655/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>9804</start>
<end>9813</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mitigating the Discrepancy Between Video and Text Temporal Sequences: A Time-Perception Enhanced Video Grounding method for LLM
%A Li, Xuefen
%A Wang, Bo
%A Shi, Ge
%A Feng, Chong
%A Teng, Jiahao
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F li-etal-2025-mitigating
%X Existing video LLMs typically excel at capturing the overall description of a video but lack the ability to demonstrate an understanding of temporal dynamics and a fine-grained grasp of localized content within the video. In this paper, we propose a Time-Perception Enhanced Video Grounding via Boundary Perception and Temporal Reasoning aimed at mitigating LLMs’ difficulties in understanding the discrepancies between video and text temporality. Specifically, to address the inherent biases in current datasets, we design a series of boundary-perception tasks to enable LLMs to capture accurate video temporality. To tackle LLMs’ insufficient understanding of temporal information, we develop specialized tasks for boundary perception and temporal relationship reasoning to deepen LLMs’ perception of video temporality. Our experimental results show significant improvements across three datasets: ActivityNet, Charades, and DiDeMo (achieving up to 11.2% improvement on R@0.3), demonstrating the effectiveness of our proposed temporal awareness-enhanced data construction method.
%U https://aclanthology.org/2025.coling-main.655/
%P 9804-9813
Markdown (Informal)
[Mitigating the Discrepancy Between Video and Text Temporal Sequences: A Time-Perception Enhanced Video Grounding method for LLM](https://aclanthology.org/2025.coling-main.655/) (Li et al., COLING 2025)
ACL
Xuefen Li, Bo Wang, Ge Shi, Chong Feng, and Jiahao Teng. 2025. [Mitigating the Discrepancy Between Video and Text Temporal Sequences: A Time-Perception Enhanced Video Grounding method for LLM](https://aclanthology.org/2025.coling-main.655/). In *Proceedings of the 31st International Conference on Computational Linguistics*, pages 9804–9813, Abu Dhabi, UAE. Association for Computational Linguistics.
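
For reference, the abstract reports gains of up to 11.2% on R@0.3, i.e. recall at a temporal IoU threshold of 0.3. Below is a minimal sketch of how that metric is conventionally computed for video grounding predictions; the function and variable names are illustrative assumptions, not code from the paper.

```python
# Sketch of Recall@IoU ("R@0.3") for temporal video grounding.
# Segments are (start, end) pairs in seconds. Names are illustrative only.

def temporal_iou(pred, gt):
    """IoU between two (start, end) segments."""
    inter = max(0.0, min(pred[1], gt[1]) - max(pred[0], gt[0]))
    union = max(pred[1], gt[1]) - min(pred[0], gt[0])
    return inter / union if union > 0 else 0.0

def recall_at_iou(predictions, ground_truths, threshold=0.3):
    """Fraction of queries whose predicted segment reaches the IoU threshold."""
    hits = sum(
        temporal_iou(p, g) >= threshold
        for p, g in zip(predictions, ground_truths)
    )
    return hits / len(ground_truths)

# Example: two of the three predictions overlap their targets at IoU >= 0.3.
preds = [(1.0, 5.0), (10.0, 12.0), (30.0, 31.0)]
gts = [(2.0, 6.0), (10.5, 13.0), (40.0, 45.0)]
print(recall_at_iou(preds, gts, 0.3))  # ~0.667
```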