@inproceedings{kuribayashi-baldwin-2025-vision,
    title = "Does Vision Accelerate Hierarchical Generalization in Neural Language Learners?",
    author = "Kuribayashi, Tatsuki and
      Baldwin, Timothy",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.127/",
    pages = "1865--1879",
    abstract = "Neural language models (LMs) are arguably less data-efficient than humans from a language acquisition perspective. One fundamental question is why this human{--}LM gap arises. This study explores the advantage of \textit{grounded} language acquisition, specifically the impact of visual information {---} which humans can usually rely on but LMs largely do not have access to during language acquisition {---} on syntactic generalization in LMs. Our experiments, following the poverty of stimulus paradigm under two scenarios (using artificial vs. naturalistic images), demonstrate that if the alignments between the linguistic and visual components are clear in the input, access to vision data does help with the syntactic generalization of LMs, but if not, visual input does not help. This highlights the need for additional biases or signals, such as mutual gaze, to enhance cross-modal alignment and enable efficient syntactic generalization in multimodal LMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kuribayashi-baldwin-2025-vision">
    <titleInfo>
      <title>Does Vision Accelerate Hierarchical Generalization in Neural Language Learners?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Tatsuki</namePart>
      <namePart type="family">Kuribayashi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Timothy</namePart>
      <namePart type="family">Baldwin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Barbara</namePart>
        <namePart type="family">Di Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Neural language models (LMs) are arguably less data-efficient than humans from a language acquisition perspective. One fundamental question is why this human–LM gap arises. This study explores the advantage of grounded language acquisition, specifically the impact of visual information — which humans can usually rely on but LMs largely do not have access to during language acquisition — on syntactic generalization in LMs. Our experiments, following the poverty of stimulus paradigm under two scenarios (using artificial vs. naturalistic images), demonstrate that if the alignments between the linguistic and visual components are clear in the input, access to vision data does help with the syntactic generalization of LMs, but if not, visual input does not help. This highlights the need for additional biases or signals, such as mutual gaze, to enhance cross-modal alignment and enable efficient syntactic generalization in multimodal LMs.</abstract>
    <identifier type="citekey">kuribayashi-baldwin-2025-vision</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.127/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>1865</start>
        <end>1879</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Does Vision Accelerate Hierarchical Generalization in Neural Language Learners?
%A Kuribayashi, Tatsuki
%A Baldwin, Timothy
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F kuribayashi-baldwin-2025-vision
%X Neural language models (LMs) are arguably less data-efficient than humans from a language acquisition perspective. One fundamental question is why this human–LM gap arises. This study explores the advantage of grounded language acquisition, specifically the impact of visual information — which humans can usually rely on but LMs largely do not have access to during language acquisition — on syntactic generalization in LMs. Our experiments, following the poverty of stimulus paradigm under two scenarios (using artificial vs. naturalistic images), demonstrate that if the alignments between the linguistic and visual components are clear in the input, access to vision data does help with the syntactic generalization of LMs, but if not, visual input does not help. This highlights the need for additional biases or signals, such as mutual gaze, to enhance cross-modal alignment and enable efficient syntactic generalization in multimodal LMs.
%U https://aclanthology.org/2025.coling-main.127/
%P 1865-1879
Markdown (Informal)
[Does Vision Accelerate Hierarchical Generalization in Neural Language Learners?](https://aclanthology.org/2025.coling-main.127/) (Kuribayashi & Baldwin, COLING 2025)
ACL
Tatsuki Kuribayashi and Timothy Baldwin. 2025. [Does Vision Accelerate Hierarchical Generalization in Neural Language Learners?](https://aclanthology.org/2025.coling-main.127/). In *Proceedings of the 31st International Conference on Computational Linguistics*, pages 1865–1879, Abu Dhabi, UAE. Association for Computational Linguistics.