@inproceedings{jones-trott-2024-multimodal,
title = "Multimodal Language Models Show Evidence of Embodied Simulation",
author = "Jones, Cameron R. and
Trott, Sean",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1041",
pages = "11928--11933",
abstract = "Multimodal large language models (MLLMs) are gaining popularity as partial solutions to the {``}symbol grounding problem{''} faced by language models trained on text alone. However, little is known about whether and how these multiple modalities are integrated. We draw inspiration from analogous work in human psycholinguistics on embodied simulation, i.e., the hypothesis that language comprehension is grounded in sensorimotor representations. We show that MLLMs are sensitive to implicit visual features like object shape (e.g., {``}The egg was in the skillet{''} implies a frying egg rather than one in a shell). This suggests that MLLMs activate implicit information about object shape when it is implied by a verbal description of an event. We find mixed results for color and orientation, and rule out the possibility that this is due to models{'} insensitivity to those features in our dataset overall. We suggest that both human psycholinguistics and computational models of language could benefit from cross-pollination, e.g., with the potential to establish whether grounded representations play a functional role in language processing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jones-trott-2024-multimodal">
<titleInfo>
<title>Multimodal Language Models Show Evidence of Embodied Simulation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cameron</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Jones</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sean</namePart>
<namePart type="family">Trott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal large language models (MLLMs) are gaining popularity as partial solutions to the “symbol grounding problem” faced by language models trained on text alone. However, little is known about whether and how these multiple modalities are integrated. We draw inspiration from analogous work in human psycholinguistics on embodied simulation, i.e., the hypothesis that language comprehension is grounded in sensorimotor representations. We show that MLLMs are sensitive to implicit visual features like object shape (e.g., “The egg was in the skillet” implies a frying egg rather than one in a shell). This suggests that MLLMs activate implicit information about object shape when it is implied by a verbal description of an event. We find mixed results for color and orientation, and rule out the possibility that this is due to models’ insensitivity to those features in our dataset overall. We suggest that both human psycholinguistics and computational models of language could benefit from cross-pollination, e.g., with the potential to establish whether grounded representations play a functional role in language processing.</abstract>
<identifier type="citekey">jones-trott-2024-multimodal</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1041</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>11928</start>
<end>11933</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Language Models Show Evidence of Embodied Simulation
%A Jones, Cameron R.
%A Trott, Sean
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F jones-trott-2024-multimodal
%X Multimodal large language models (MLLMs) are gaining popularity as partial solutions to the “symbol grounding problem” faced by language models trained on text alone. However, little is known about whether and how these multiple modalities are integrated. We draw inspiration from analogous work in human psycholinguistics on embodied simulation, i.e., the hypothesis that language comprehension is grounded in sensorimotor representations. We show that MLLMs are sensitive to implicit visual features like object shape (e.g., “The egg was in the skillet” implies a frying egg rather than one in a shell). This suggests that MLLMs activate implicit information about object shape when it is implied by a verbal description of an event. We find mixed results for color and orientation, and rule out the possibility that this is due to models’ insensitivity to those features in our dataset overall. We suggest that both human psycholinguistics and computational models of language could benefit from cross-pollination, e.g., with the potential to establish whether grounded representations play a functional role in language processing.
%U https://aclanthology.org/2024.lrec-main.1041
%P 11928-11933
Markdown (Informal)
[Multimodal Language Models Show Evidence of Embodied Simulation](https://aclanthology.org/2024.lrec-main.1041) (Jones & Trott, LREC-COLING 2024)
ACL