BibTeX
@inproceedings{villa-cueva-etal-2025-moments,
title = "{M}o{M}ent{S}: A Comprehensive Multimodal Benchmark for Theory of Mind",
author = "Villa-Cueva, Emilio and
Ahmed, S M Masrur and
Chevi, Rendi and
Cruz, Jan Christian Blaise and
Elzeky, Kareem and
Cristobal, Fermin and
Aji, Alham Fikri and
Wang, Skyler and
Mihalcea, Rada and
Solorio, Thamar",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1230/",
doi = "10.18653/v1/2025.findings-emnlp.1230",
pages = "22591--22611",
ISBN = "979-8-89176-335-7",
abstract = "Understanding Theory of Mind is essential for building socially intelligent multimodal agents capable of perceiving and interpreting human behavior. We introduce MoMentS (Multimodal Mental States), a comprehensive benchmark designed to assess the ToM capabilities of multimodal large language models (LLMs) through realistic, narrative-rich scenarios presented in short films. MoMentS includes over 2,300 multiple-choice questions spanning seven distinct ToM categories. The benchmark features long video context windows and realistic social interactions that provide deeper insight into characters' mental states. We evaluate several MLLMs and find that although vision generally improves performance, models still struggle to integrate it effectively. For audio, models that process dialogues as audio do not consistently outperform transcript-based inputs. Our findings highlight the need to improve multimodal integration and point to open challenges that must be addressed to advance AI{'}s social understanding."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="villa-cueva-etal-2025-moments">
<titleInfo>
<title>MoMentS: A Comprehensive Multimodal Benchmark for Theory of Mind</title>
</titleInfo>
<name type="personal">
<namePart type="given">Emilio</namePart>
<namePart type="family">Villa-Cueva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">S</namePart>
<namePart type="given">M</namePart>
<namePart type="given">Masrur</namePart>
<namePart type="family">Ahmed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rendi</namePart>
<namePart type="family">Chevi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="given">Christian</namePart>
<namePart type="given">Blaise</namePart>
<namePart type="family">Cruz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Elzeky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fermin</namePart>
<namePart type="family">Cristobal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Skyler</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rada</namePart>
<namePart type="family">Mihalcea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thamar</namePart>
<namePart type="family">Solorio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Understanding Theory of Mind is essential for building socially intelligent multimodal agents capable of perceiving and interpreting human behavior. We introduce MoMentS (Multimodal Mental States), a comprehensive benchmark designed to assess the ToM capabilities of multimodal large language models (LLMs) through realistic, narrative-rich scenarios presented in short films. MoMentS includes over 2,300 multiple-choice questions spanning seven distinct ToM categories. The benchmark features long video context windows and realistic social interactions that provide deeper insight into characters’ mental states. We evaluate several MLLMs and find that although vision generally improves performance, models still struggle to integrate it effectively. For audio, models that process dialogues as audio do not consistently outperform transcript-based inputs. Our findings highlight the need to improve multimodal integration and point to open challenges that must be addressed to advance AI’s social understanding.</abstract>
<identifier type="citekey">villa-cueva-etal-2025-moments</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.1230</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1230/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>22591</start>
<end>22611</end>
</extent>
</part>
</mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T MoMentS: A Comprehensive Multimodal Benchmark for Theory of Mind
%A Villa-Cueva, Emilio
%A Ahmed, S. M. Masrur
%A Chevi, Rendi
%A Cruz, Jan Christian Blaise
%A Elzeky, Kareem
%A Cristobal, Fermin
%A Aji, Alham Fikri
%A Wang, Skyler
%A Mihalcea, Rada
%A Solorio, Thamar
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F villa-cueva-etal-2025-moments
%X Understanding Theory of Mind is essential for building socially intelligent multimodal agents capable of perceiving and interpreting human behavior. We introduce MoMentS (Multimodal Mental States), a comprehensive benchmark designed to assess the ToM capabilities of multimodal large language models (LLMs) through realistic, narrative-rich scenarios presented in short films. MoMentS includes over 2,300 multiple-choice questions spanning seven distinct ToM categories. The benchmark features long video context windows and realistic social interactions that provide deeper insight into characters’ mental states. We evaluate several MLLMs and find that although vision generally improves performance, models still struggle to integrate it effectively. For audio, models that process dialogues as audio do not consistently outperform transcript-based inputs. Our findings highlight the need to improve multimodal integration and point to open challenges that must be addressed to advance AI’s social understanding.
%R 10.18653/v1/2025.findings-emnlp.1230
%U https://aclanthology.org/2025.findings-emnlp.1230/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1230
%P 22591-22611

Markdown (Informal)
[MoMentS: A Comprehensive Multimodal Benchmark for Theory of Mind](https://aclanthology.org/2025.findings-emnlp.1230/) (Villa-Cueva et al., Findings 2025)
ACL
- Emilio Villa-Cueva, S M Masrur Ahmed, Rendi Chevi, Jan Christian Blaise Cruz, Kareem Elzeky, Fermin Cristobal, Alham Fikri Aji, Skyler Wang, Rada Mihalcea, and Thamar Solorio. 2025. MoMentS: A Comprehensive Multimodal Benchmark for Theory of Mind. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 22591–22611, Suzhou, China. Association for Computational Linguistics.