% ACL Anthology record: TRACE paper in the MAGMaR 2026 workshop proceedings.
% NOTE(review): the two citation-recall figures near the end of the abstract had
% lost their separator in extraction; it is restored as an arrow on the
% assumption that a before/after pair was intended. Confirm against the
% published abstract.
@inproceedings{yan-etal-2026-trace,
  title     = {{TRACE}: Evidence Grounding-Guided Multi-Video Event Understanding and Claim Generation},
  author    = {Yan, Pengyu and
               Gorugantu, Akhil V S S and
               Bhosale, Mahesh and
               Wasi, Abdul and
               Trivedi, Vishvesh and
               Doermann, David},
  editor    = {Murray, Kenton and
               Kriz, Reno},
  booktitle = {Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval ({MAGM}a{R} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.magmar-main.14/},
  pages     = {120--129},
  isbn      = {979-8-89176-425-5},
  abstract  = {Multi-video event understanding demands models that can locate and attribute query-relevant evidence scattered across long, heterogeneous video corpora. Existing large vision{--}language models (LVLMs) often underperform in this regime because they quickly exhaust their context budget and struggle to precisely localize evidentially important segments, frequently missing dense informational cues such as broadcast graphics, subtitles, and scoreboards. We introduce TRACE, an evidence grounding-guided framework that follows a \textit{ground-before-reasoning} strategy for multi-video event reasoning. Our approach first builds a structured, text-searchable timeline for each video using OCR and object detection. A text-only LLM then conducts query-aware evidence localization, selecting relevant moments prior to any downstream visual reasoning. The retrieved frames and their grounding summaries are subsequently used to steer LVLM-based claim generation and cross-video citation consolidation. Experiments on MAGMaR 2026 and WikiVideo demonstrate that structured grounding markedly boosts factual completeness and attribution fidelity. On the MAGMaR validation split, TRACE raises macro-average MiRAGE F1 from 0.705 to 0.811 compared to an unguided Qwen3-VL-30B baseline, with especially strong improvements in citation recall (0.440 $\rightarrow$ 0.628). The method also attains state-of-the-art results on the official MAGMaR 2026 leaderboard.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yan-etal-2026-trace">
<titleInfo>
<title>TRACE: Evidence Grounding-Guided Multi-Video Event Understanding and Claim Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pengyu</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akhil</namePart>
<namePart type="given">V</namePart>
<namePart type="given">S</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Gorugantu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mahesh</namePart>
<namePart type="family">Bhosale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdul</namePart>
<namePart type="family">Wasi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vishvesh</namePart>
<namePart type="family">Trivedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Doermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kenton</namePart>
<namePart type="family">Murray</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reno</namePart>
<namePart type="family">Kriz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-425-5</identifier>
</relatedItem>
<abstract>Multi-video event understanding demands models that can locate and attribute query-relevant evidence scattered across long, heterogeneous video corpora. Existing large vision–language models (LVLMs) often underperform in this regime because they quickly exhaust their context budget and struggle to precisely localize evidentially important segments, frequently missing dense informational cues such as broadcast graphics, subtitles, and scoreboards. We introduce TRACE, an evidence grounding-guided framework that follows a ground-before-reasoning strategy for multi-video event reasoning. Our approach first builds a structured, text-searchable timeline for each video using OCR and object detection. A text-only LLM then conducts query-aware evidence localization, selecting relevant moments prior to any downstream visual reasoning. The retrieved frames and their grounding summaries are subsequently used to steer LVLM-based claim generation and cross-video citation consolidation. Experiments on MAGMaR 2026 and WikiVideo demonstrate that structured grounding markedly boosts factual completeness and attribution fidelity. On the MAGMaR validation split, TRACE raises macro-average MiRAGE F1 from 0.705 to 0.811 compared to an unguided Qwen3-VL-30B baseline, with especially strong improvements in citation recall (0.440 → 0.628). The method also attains state-of-the-art results on the official MAGMaR 2026 leaderboard.</abstract>
<identifier type="citekey">yan-etal-2026-trace</identifier>
<location>
<url>https://aclanthology.org/2026.magmar-main.14/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>120</start>
<end>129</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TRACE: Evidence Grounding-Guided Multi-Video Event Understanding and Claim Generation
%A Yan, Pengyu
%A Gorugantu, Akhil V. S. S.
%A Bhosale, Mahesh
%A Wasi, Abdul
%A Trivedi, Vishvesh
%A Doermann, David
%Y Murray, Kenton
%Y Kriz, Reno
%S Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, USA
%@ 979-8-89176-425-5
%F yan-etal-2026-trace
%X Multi-video event understanding demands models that can locate and attribute query-relevant evidence scattered across long, heterogeneous video corpora. Existing large vision–language models (LVLMs) often underperform in this regime because they quickly exhaust their context budget and struggle to precisely localize evidentially important segments, frequently missing dense informational cues such as broadcast graphics, subtitles, and scoreboards. We introduce TRACE, an evidence grounding-guided framework that follows a ground-before-reasoning strategy for multi-video event reasoning. Our approach first builds a structured, text-searchable timeline for each video using OCR and object detection. A text-only LLM then conducts query-aware evidence localization, selecting relevant moments prior to any downstream visual reasoning. The retrieved frames and their grounding summaries are subsequently used to steer LVLM-based claim generation and cross-video citation consolidation. Experiments on MAGMaR 2026 and WikiVideo demonstrate that structured grounding markedly boosts factual completeness and attribution fidelity. On the MAGMaR validation split, TRACE raises macro-average MiRAGE F1 from 0.705 to 0.811 compared to an unguided Qwen3-VL-30B baseline, with especially strong improvements in citation recall (0.440 → 0.628). The method also attains state-of-the-art results on the official MAGMaR 2026 leaderboard.
%U https://aclanthology.org/2026.magmar-main.14/
%P 120-129
Markdown (Informal)
[TRACE: Evidence Grounding-Guided Multi-Video Event Understanding and Claim Generation](https://aclanthology.org/2026.magmar-main.14/) (Yan et al., MAGMaR 2026)
ACL