@inproceedings{chang-etal-2026-multimodal,
title = "Multimodal Conversation Structure Understanding",
author = "Chang, Kent K. and
Cramer, Mackenzie Hanh and
Ho, Anna and
Nguyen, Ti Ti and
Yuan, Yilin and
Bamman, David",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.349/",
pages = "7437--7458",
ISBN = "979-8-89176-380-7",
abstract = "While multimodal large language models (LLMs) excel at dialogue, whether they can adequately parse the structure of conversation{---}conversational roles and threading{---}remains underexplored. In this work, we introduce a suite of tasks and release TV-MMPC, a new annotated dataset, for multimodal conversation structure understanding. Our evaluation reveals that while all multimodal LLMs outperform our heuristic baseline, even the best-performing model we consider experiences a substantial drop in performance when character identities of the conversation are anonymized. Beyond evaluation, we carry out a sociolinguistic analysis of 350,842 utterances in TVQA. We find that while female characters initiate conversations at rates in proportion to their speaking time, they are 1.2 times more likely than men to be cast as an addressee or side-participant, and the presence of side-participants shifts the conversational register from personal to social."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chang-etal-2026-multimodal">
<titleInfo>
<title>Multimodal Conversation Structure Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kent</namePart>
<namePart type="given">K</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mackenzie</namePart>
<namePart type="given">Hanh</namePart>
<namePart type="family">Cramer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Ho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ti</namePart>
<namePart type="given">Ti</namePart>
<namePart type="family">Nguyen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yilin</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Bamman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>While multimodal large language models (LLMs) excel at dialogue, whether they can adequately parse the structure of conversation—conversational roles and threading—remains underexplored. In this work, we introduce a suite of tasks and release TV-MMPC, a new annotated dataset, for multimodal conversation structure understanding. Our evaluation reveals that while all multimodal LLMs outperform our heuristic baseline, even the best-performing model we consider experiences a substantial drop in performance when character identities of the conversation are anonymized. Beyond evaluation, we carry out a sociolinguistic analysis of 350,842 utterances in TVQA. We find that while female characters initiate conversations at rates in proportion to their speaking time, they are 1.2 times more likely than men to be cast as an addressee or side-participant, and the presence of side-participants shifts the conversational register from personal to social.</abstract>
<identifier type="citekey">chang-etal-2026-multimodal</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.349/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>7437</start>
<end>7458</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Conversation Structure Understanding
%A Chang, Kent K.
%A Cramer, Mackenzie Hanh
%A Ho, Anna
%A Nguyen, Ti Ti
%A Yuan, Yilin
%A Bamman, David
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F chang-etal-2026-multimodal
%X While multimodal large language models (LLMs) excel at dialogue, whether they can adequately parse the structure of conversation—conversational roles and threading—remains underexplored. In this work, we introduce a suite of tasks and release TV-MMPC, a new annotated dataset, for multimodal conversation structure understanding. Our evaluation reveals that while all multimodal LLMs outperform our heuristic baseline, even the best-performing model we consider experiences a substantial drop in performance when character identities of the conversation are anonymized. Beyond evaluation, we carry out a sociolinguistic analysis of 350,842 utterances in TVQA. We find that while female characters initiate conversations at rates in proportion to their speaking time, they are 1.2 times more likely than men to be cast as an addressee or side-participant, and the presence of side-participants shifts the conversational register from personal to social.
%U https://aclanthology.org/2026.eacl-long.349/
%P 7437-7458
Markdown (Informal)
[Multimodal Conversation Structure Understanding](https://aclanthology.org/2026.eacl-long.349/) (Chang et al., EACL 2026)
ACL
- Kent K. Chang, Mackenzie Hanh Cramer, Anna Ho, Ti Ti Nguyen, Yilin Yuan, and David Bamman. 2026. Multimodal Conversation Structure Understanding. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 7437–7458, Rabat, Morocco. Association for Computational Linguistics.