@inproceedings{lei-etal-2026-live,
title = "Live-Aid: A Large-Scale Dialogue Dataset and Benchmark for Interleaved Multi-party Interactions in Live Streaming",
author = "Lei, Yiming and
Fan, Yize and
Liu, Zeming and
Dong, Jiaji and
Qiu, Hui and
Leng, Haitao and
Liu, Qingjie and
Chen, Kehai and
Gao, Tingting and
Wang, Yunhong",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1193/",
pages = "23813--23850",
ISBN = "979-8-89176-395-1",
abstract = "Recent advancements in Multimodal Large Language Models (MLLMs) have achieved significant success in understanding static pre-recorded video scenarios (e.g., event-centric or narrative-driven content). However, existing MLLMs are largely trained on datasets restricted to static content due to the scarcity of high-quality interleaved data, causing them to struggle with dynamic interactions. Distinct from pre-recorded videos, live streaming is characterized by high-density, interleaved multimodal turns, where viewer comments (danmaku) are tightly coupled with real-time audio-visual evidence and evolving dialogue context. In such settings, purely textual annotations fail to capture fine-grained visual and temporal dependencies. To bridge this gap, we introduce Live-Aid, the first large-scale interleaved live interaction Chinese dataset with human-annotated, temporally aligned video responses, spanning over 1,100 hours and 80,037 dialogue turns across 8,053 video sessions. Building on this, we leverage these high-quality annotations within a novel multi-agent pipeline to construct evaluation tasks targeting core capabilities of live interactions. Extensive evaluations of strong Video-LLMs and Omni-LLMs reveal critical limitations in interleaved multi-turn interactions requiring temporal reasoning, highlighting the value of Live-Aid in advancing interleaved multimodal reasoning and dynamic audio-visual dependencies."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lei-etal-2026-live">
<titleInfo>
<title>Live-Aid: A Large-Scale Dialogue Dataset and Benchmark for Interleaved Multi-party Interactions in Live Streaming</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiming</namePart>
<namePart type="family">Lei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yize</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeming</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaji</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hui</namePart>
<namePart type="family">Qiu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haitao</namePart>
<namePart type="family">Leng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingjie</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kehai</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tingting</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunhong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Recent advancements in Multimodal Large Language Models (MLLMs) have achieved significant success in understanding static pre-recorded video scenarios (e.g., event-centric or narrative-driven content). However, existing MLLMs are largely trained on datasets restricted to static content due to the scarcity of high-quality interleaved data, causing them to struggle with dynamic interactions. Distinct from pre-recorded videos, live streaming is characterized by high-density, interleaved multimodal turns, where viewer comments (danmaku) are tightly coupled with real-time audio-visual evidence and evolving dialogue context. In such settings, purely textual annotations fail to capture fine-grained visual and temporal dependencies. To bridge this gap, we introduce Live-Aid, the first large-scale interleaved live interaction Chinese dataset with human-annotated, temporally aligned video responses, spanning over 1,100 hours and 80,037 dialogue turns across 8,053 video sessions. Building on this, we leverage these high-quality annotations within a novel multi-agent pipeline to construct evaluation tasks targeting core capabilities of live interactions. Extensive evaluations of strong Video-LLMs and Omni-LLMs reveal critical limitations in interleaved multi-turn interactions requiring temporal reasoning, highlighting the value of Live-Aid in advancing interleaved multimodal reasoning and dynamic audio-visual dependencies.</abstract>
<identifier type="citekey">lei-etal-2026-live</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1193/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>23813</start>
<end>23850</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Live-Aid: A Large-Scale Dialogue Dataset and Benchmark for Interleaved Multi-party Interactions in Live Streaming
%A Lei, Yiming
%A Fan, Yize
%A Liu, Zeming
%A Dong, Jiaji
%A Qiu, Hui
%A Leng, Haitao
%A Liu, Qingjie
%A Chen, Kehai
%A Gao, Tingting
%A Wang, Yunhong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F lei-etal-2026-live
%X Recent advancements in Multimodal Large Language Models (MLLMs) have achieved significant success in understanding static pre-recorded video scenarios (e.g., event-centric or narrative-driven content). However, existing MLLMs are largely trained on datasets restricted to static content due to the scarcity of high-quality interleaved data, causing them to struggle with dynamic interactions. Distinct from pre-recorded videos, live streaming is characterized by high-density, interleaved multimodal turns, where viewer comments (danmaku) are tightly coupled with real-time audio-visual evidence and evolving dialogue context. In such settings, purely textual annotations fail to capture fine-grained visual and temporal dependencies. To bridge this gap, we introduce Live-Aid, the first large-scale interleaved live interaction Chinese dataset with human-annotated, temporally aligned video responses, spanning over 1,100 hours and 80,037 dialogue turns across 8,053 video sessions. Building on this, we leverage these high-quality annotations within a novel multi-agent pipeline to construct evaluation tasks targeting core capabilities of live interactions. Extensive evaluations of strong Video-LLMs and Omni-LLMs reveal critical limitations in interleaved multi-turn interactions requiring temporal reasoning, highlighting the value of Live-Aid in advancing interleaved multimodal reasoning and dynamic audio-visual dependencies.
%U https://aclanthology.org/2026.findings-acl.1193/
%P 23813-23850
Markdown (Informal)
[Live-Aid: A Large-Scale Dialogue Dataset and Benchmark for Interleaved Multi-party Interactions in Live Streaming](https://aclanthology.org/2026.findings-acl.1193/) (Lei et al., Findings 2026)
ACL
- Yiming Lei, Yize Fan, Zeming Liu, Jiaji Dong, Hui Qiu, Haitao Leng, Qingjie Liu, Kehai Chen, Tingting Gao, and Yunhong Wang. 2026. Live-Aid: A Large-Scale Dialogue Dataset and Benchmark for Interleaved Multi-party Interactions in Live Streaming. In Findings of the Association for Computational Linguistics: ACL 2026, pages 23813–23850, San Diego, California, United States. Association for Computational Linguistics.