@article{jones-etal-2024-multimodal,
title = "Do Multimodal Large Language Models and Humans Ground Language Similarly?",
author = "Jones, Cameron R. and
Bergen, Benjamin and
Trott, Sean",
journal = "Computational Linguistics",
volume = "50",
number = "3",
month = dec,
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.cl-4.7/",
doi = "10.1162/coli_a_00531",
pages = "1415--1440",
abstract = "Large Language Models (LLMs) have been criticized for failing to connect linguistic meaning to the world{---}for failing to solve the {\textquotedblleft}symbol grounding problem.{\textquotedblright} Multimodal Large Language Models (MLLMs) offer a potential solution to this challenge by combining linguistic representations and processing with other modalities. However, much is still unknown about exactly how and to what degree MLLMs integrate their distinct modalities{---}and whether the way they do so mirrors the mechanisms believed to underpin grounding in humans. In humans, it has been hypothesized that linguistic meaning is grounded through {\textquotedblleft}embodied simulation,{\textquotedblright} the activation of sensorimotor and affective representations reflecting described experiences. Across four pre-registered studies, we adapt experimental techniques originally developed to investigate embodied simulation in human comprehenders to ask whether MLLMs are sensitive to sensorimotor features that are implied but not explicit in descriptions of an event. In Experiment 1, we find sensitivity to some features (color and shape) but not others (size, orientation, and volume). In Experiment 2, we identify likely bottlenecks to explain an MLLM`s lack of sensitivity. In Experiment 3, we find that despite sensitivity to implicit sensorimotor features, MLLMs cannot fully account for human behavior on the same task. Finally, in Experiment 4, we compare the psychometric predictive power of different MLLM architectures and find that ViLT, a single-stream architecture, is more predictive of human responses to one sensorimotor feature (shape) than CLIP, a dual-encoder architecture{---}despite being trained on orders of magnitude less data. These results reveal strengths and limitations in the ability of current MLLMs to integrate language with other modalities, and also shed light on the likely mechanisms underlying human language comprehension."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jones-etal-2024-multimodal">
<titleInfo>
<title>Do Multimodal Large Language Models and Humans Ground Language Similarly?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cameron</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Jones</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Bergen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sean</namePart>
<namePart type="family">Trott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) have been criticized for failing to connect linguistic meaning to the world—for failing to solve the “symbol grounding problem.” Multimodal Large Language Models (MLLMs) offer a potential solution to this challenge by combining linguistic representations and processing with other modalities. However, much is still unknown about exactly how and to what degree MLLMs integrate their distinct modalities—and whether the way they do so mirrors the mechanisms believed to underpin grounding in humans. In humans, it has been hypothesized that linguistic meaning is grounded through “embodied simulation,” the activation of sensorimotor and affective representations reflecting described experiences. Across four pre-registered studies, we adapt experimental techniques originally developed to investigate embodied simulation in human comprehenders to ask whether MLLMs are sensitive to sensorimotor features that are implied but not explicit in descriptions of an event. In Experiment 1, we find sensitivity to some features (color and shape) but not others (size, orientation, and volume). In Experiment 2, we identify likely bottlenecks to explain an MLLM‘s lack of sensitivity. In Experiment 3, we find that despite sensitivity to implicit sensorimotor features, MLLMs cannot fully account for human behavior on the same task. Finally, in Experiment 4, we compare the psychometric predictive power of different MLLM architectures and find that ViLT, a single-stream architecture, is more predictive of human responses to one sensorimotor feature (shape) than CLIP, a dual-encoder architecture—despite being trained on orders of magnitude less data. These results reveal strengths and limitations in the ability of current MLLMs to integrate language with other modalities, and also shed light on the likely mechanisms underlying human language comprehension.</abstract>
<identifier type="citekey">jones-etal-2024-multimodal</identifier>
<identifier type="doi">10.1162/coli_a_00531</identifier>
<location>
<url>https://aclanthology.org/2024.cl-4.7/</url>
</location>
<part>
<date>2024-12</date>
<detail type="volume"><number>50</number></detail>
<detail type="issue"><number>3</number></detail>
<extent unit="page">
<start>1415</start>
<end>1440</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Do Multimodal Large Language Models and Humans Ground Language Similarly?
%A Jones, Cameron R.
%A Bergen, Benjamin
%A Trott, Sean
%J Computational Linguistics
%D 2024
%8 December
%V 50
%N 3
%I MIT Press
%C Cambridge, MA
%F jones-etal-2024-multimodal
%X Large Language Models (LLMs) have been criticized for failing to connect linguistic meaning to the world—for failing to solve the “symbol grounding problem.” Multimodal Large Language Models (MLLMs) offer a potential solution to this challenge by combining linguistic representations and processing with other modalities. However, much is still unknown about exactly how and to what degree MLLMs integrate their distinct modalities—and whether the way they do so mirrors the mechanisms believed to underpin grounding in humans. In humans, it has been hypothesized that linguistic meaning is grounded through “embodied simulation,” the activation of sensorimotor and affective representations reflecting described experiences. Across four pre-registered studies, we adapt experimental techniques originally developed to investigate embodied simulation in human comprehenders to ask whether MLLMs are sensitive to sensorimotor features that are implied but not explicit in descriptions of an event. In Experiment 1, we find sensitivity to some features (color and shape) but not others (size, orientation, and volume). In Experiment 2, we identify likely bottlenecks to explain an MLLM‘s lack of sensitivity. In Experiment 3, we find that despite sensitivity to implicit sensorimotor features, MLLMs cannot fully account for human behavior on the same task. Finally, in Experiment 4, we compare the psychometric predictive power of different MLLM architectures and find that ViLT, a single-stream architecture, is more predictive of human responses to one sensorimotor feature (shape) than CLIP, a dual-encoder architecture—despite being trained on orders of magnitude less data. These results reveal strengths and limitations in the ability of current MLLMs to integrate language with other modalities, and also shed light on the likely mechanisms underlying human language comprehension.
%R 10.1162/coli_a_00531
%U https://aclanthology.org/2024.cl-4.7/
%U https://doi.org/10.1162/coli_a_00531
%P 1415-1440
Markdown (Informal)
[Do Multimodal Large Language Models and Humans Ground Language Similarly?](https://aclanthology.org/2024.cl-4.7/) (Jones et al., CL 2024)
ACL