@inproceedings{dong-etal-2025-bridging,
title = "Bridging Language and Scenes through Explicit 3-{D} Model Construction",
author = "Dong, Tiansi and
Das, Writwick and
Sifa, Rafet",
editor = "Liu, Kang and
Song, Yangqiu and
Han, Zhen and
Sifa, Rafet and
He, Shizhu and
Long, Yunfei",
booktitle = "Proceedings of Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning @ COLING 2025",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2025.neusymbridge-1.6/",
pages = "51--60",
abstract = "We introduce the methodology of explicit model construction to bridge linguistic descriptions and scene perception and demonstrate that in Visual Question-Answering (VQA) using MC4VQA (Model Construction for Visual Question-Answering), a method developed by us. Given a question about a scene, our MC4VQA first recognizes objects utilizing pre-trained deep learning systems. Then, it constructs an explicit 3-D layout by repeatedly reducing the difference between the input scene image and the image rendered from the current 3-D spatial environment. This novel {\textquotedblleft}iterative rendering{\textquotedblright} process endows MC4VQA the capability of acquiring spatial attributes without training data. MC4VQA outperforms NS-VQA (the SOTA system) by reaching 99.94{\%} accuracy on the benchmark CLEVR datasets, and is more robust than NS-VQA on new testing datasets. With newly created testing data, NS-VQA`s performance dropped to 97.60{\%}, while MC4VQA still kept the 99.0{\%} accuracy. This work sets a new SOTA performance of VQA on the benchmark CLEVR datasets, and shapes a new method that may solve the out-of-distribution problem."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dong-etal-2025-bridging">
<titleInfo>
<title>Bridging Language and Scenes through Explicit 3-D Model Construction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tiansi</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Writwick</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafet</namePart>
<namePart type="family">Sifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning @ COLING 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangqiu</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhen</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafet</namePart>
<namePart type="family">Sifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shizhu</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunfei</namePart>
<namePart type="family">Long</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce the methodology of explicit model construction to bridge linguistic descriptions and scene perception, and we demonstrate it for Visual Question-Answering (VQA) with MC4VQA (Model Construction for Visual Question-Answering), a method we developed. Given a question about a scene, MC4VQA first recognizes objects using pre-trained deep learning systems. It then constructs an explicit 3-D layout by repeatedly reducing the difference between the input scene image and the image rendered from the current 3-D spatial environment. This novel “iterative rendering” process endows MC4VQA with the capability of acquiring spatial attributes without training data. MC4VQA outperforms NS-VQA (the SOTA system) by reaching 99.94% accuracy on the benchmark CLEVR datasets, and is more robust than NS-VQA on new test datasets. On newly created test data, NS-VQA’s performance dropped to 97.60%, while MC4VQA maintained 99.0% accuracy. This work sets a new SOTA for VQA on the benchmark CLEVR datasets and outlines a new method that may address the out-of-distribution problem.</abstract>
<identifier type="citekey">dong-etal-2025-bridging</identifier>
<location>
<url>https://aclanthology.org/2025.neusymbridge-1.6/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>51</start>
<end>60</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Bridging Language and Scenes through Explicit 3-D Model Construction
%A Dong, Tiansi
%A Das, Writwick
%A Sifa, Rafet
%Y Liu, Kang
%Y Song, Yangqiu
%Y Han, Zhen
%Y Sifa, Rafet
%Y He, Shizhu
%Y Long, Yunfei
%S Proceedings of Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning @ COLING 2025
%D 2025
%8 January
%I ELRA and ICCL
%C Abu Dhabi, UAE
%F dong-etal-2025-bridging
%X We introduce the methodology of explicit model construction to bridge linguistic descriptions and scene perception, and we demonstrate it for Visual Question-Answering (VQA) with MC4VQA (Model Construction for Visual Question-Answering), a method we developed. Given a question about a scene, MC4VQA first recognizes objects using pre-trained deep learning systems. It then constructs an explicit 3-D layout by repeatedly reducing the difference between the input scene image and the image rendered from the current 3-D spatial environment. This novel “iterative rendering” process endows MC4VQA with the capability of acquiring spatial attributes without training data. MC4VQA outperforms NS-VQA (the SOTA system) by reaching 99.94% accuracy on the benchmark CLEVR datasets, and is more robust than NS-VQA on new test datasets. On newly created test data, NS-VQA’s performance dropped to 97.60%, while MC4VQA maintained 99.0% accuracy. This work sets a new SOTA for VQA on the benchmark CLEVR datasets and outlines a new method that may address the out-of-distribution problem.
%U https://aclanthology.org/2025.neusymbridge-1.6/
%P 51-60
Markdown (Informal)
[Bridging Language and Scenes through Explicit 3-D Model Construction](https://aclanthology.org/2025.neusymbridge-1.6/) (Dong et al., NeusymBridge 2025)
ACL
Tiansi Dong, Writwick Das, and Rafet Sifa. 2025. Bridging Language and Scenes through Explicit 3-D Model Construction. In Proceedings of Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning @ COLING 2025, pages 51–60, Abu Dhabi, UAE. ELRA and ICCL.
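
The abstract describes an "iterative rendering" loop: an explicit layout is refined by repeatedly reducing the difference between the input scene image and the image rendered from the current layout. The snippet below is a minimal, hypothetical 2-D sketch of that render-and-compare idea only. The names (`render`, `fit_layout`), the Gaussian-blob renderer, and the hill-climbing update are illustrative assumptions, not the authors' MC4VQA implementation, which uses pre-trained object detectors and a real 3-D renderer.

```python
# Toy render-and-compare sketch (illustrative only, NOT the MC4VQA pipeline):
# adjust object positions until the rendered image matches the target image.
import numpy as np

rng = np.random.default_rng(0)
GRID = 32  # toy image resolution


def render(layout):
    """Render a toy image: one Gaussian blob per (row, col) position in `layout`."""
    ys, xs = np.mgrid[0:GRID, 0:GRID]
    img = np.zeros((GRID, GRID))
    for cy, cx in layout:
        img += np.exp(-((ys - cy) ** 2 + (xs - cx) ** 2) / 8.0)
    return img


def image_difference(a, b):
    """Pixel-wise squared L2 difference between two rendered images."""
    return float(np.sum((a - b) ** 2))


def fit_layout(target_img, n_objects, steps=2000, step_size=1.0):
    """Hill-climb object positions to shrink the render-vs-target difference."""
    layout = rng.uniform(0, GRID, size=(n_objects, 2))
    best = image_difference(render(layout), target_img)
    for _ in range(steps):
        candidate = np.clip(layout + rng.normal(0, step_size, size=layout.shape),
                            0, GRID - 1)
        err = image_difference(render(candidate), target_img)
        if err < best:  # keep the perturbation only if it reduces the difference
            layout, best = candidate, err
    return layout, best


if __name__ == "__main__":
    true_layout = np.array([[8.0, 8.0], [20.0, 24.0], [25.0, 10.0]])
    target = render(true_layout)
    recovered, err = fit_layout(target, n_objects=3)
    print("final image difference:", round(err, 4))
    print("recovered positions:\n",
          np.round(recovered[np.argsort(recovered[:, 0])], 1))
```

In this toy setting the layout parameters are recovered purely by matching rendered output to the target image, with no training data, which is the property the abstract attributes to the iterative rendering step.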