@inproceedings{chen-etal-2025-seeing,
title = "Seeing Beyond: Enhancing Visual Question Answering with Multi-Modal Retrieval",
author = "Chen, Boqi and
Khare, Anuj and
Kumar, Gaurav and
Akula, Arjun and
Narayana, Pradyumna",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven and
Darwish, Kareem and
Agarwal, Apoorv",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: Industry Track",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-industry.35/",
pages = "410--421",
abstract = "Multi-modal Large language models (MLLMs) have made significant strides in complex content understanding and reasoning. However, they still suffer from model hallucination and lack of specific knowledge when facing challenging questions. To address these limitations, retrieval augmented generation (RAG) has emerged as an effective solution. While incorporating knowledge has led to improvements, it also highlights the need for a more robust knowledge selection strategy. For multi-modal tasks, such as visual question answering (VQA), integrating all modalities is crucial in providing comprehensive information for accurate answers. Therefore, we propose to construct an encoder model for extracting joint embedding from all modalities, enabling alignment between the corresponding query and knowledge through contrastive learning. To further improve performance, we introduce an additional MLLM re-selection step, which selects the best matching knowledge from the top-k retrieved results of our alignment model. We evaluated our method, SeBe-VQA, on the Encyclopedic VQA dataset. Our knowledge retrieval results demonstrate the benefit of our multi-modal framework. By incorporating the retrieved knowledge along with the question, we achieve a significant performance improvement compared with the previous method and scenarios without knowledge provision."
}
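
The abstract above describes a three-stage retrieval pipeline: a joint multi-modal encoder aligned to the knowledge source via contrastive learning, cosine-similarity top-k retrieval, and an MLLM re-selection pass over the retrieved candidates. The following is a minimal, hypothetical PyTorch sketch of such a pipeline, not the authors' SeBe-VQA implementation; every class, function, dimension, and the toy scoring callback standing in for the MLLM are assumptions made purely for illustration.

# Illustrative sketch (not the paper's code) of the pipeline the abstract describes:
# joint multi-modal encoding, contrastive alignment, top-k retrieval, MLLM re-selection.
# All names and dimensions are hypothetical stand-ins.

import torch
import torch.nn.functional as F


class JointEncoder(torch.nn.Module):
    """Toy stand-in for a multi-modal encoder that fuses image and text
    features into a single joint embedding."""

    def __init__(self, img_dim=512, txt_dim=512, out_dim=256):
        super().__init__()
        self.proj = torch.nn.Linear(img_dim + txt_dim, out_dim)

    def forward(self, img_feat, txt_feat):
        fused = torch.cat([img_feat, txt_feat], dim=-1)
        return F.normalize(self.proj(fused), dim=-1)  # unit-norm embeddings


def contrastive_loss(query_emb, knowledge_emb, temperature=0.07):
    """Symmetric InfoNCE loss: the i-th query should match the i-th
    knowledge entry and be pushed away from all others in the batch."""
    logits = query_emb @ knowledge_emb.t() / temperature
    targets = torch.arange(query_emb.size(0))
    return (F.cross_entropy(logits, targets) +
            F.cross_entropy(logits.t(), targets)) / 2


def retrieve_top_k(query_emb, knowledge_bank, k=5):
    """Return indices of the k knowledge embeddings most similar to the query."""
    scores = knowledge_bank @ query_emb  # cosine similarity (unit-norm inputs)
    return torch.topk(scores, k).indices


def mllm_reselect(question, candidates, score_fn):
    """Re-selection step: pick the best-matching passage among the top-k
    candidates, using score_fn as a stand-in for an MLLM judgment."""
    scores = [score_fn(question, c) for c in candidates]
    return candidates[int(torch.tensor(scores).argmax())]


if __name__ == "__main__":
    torch.manual_seed(0)
    encoder = JointEncoder()

    # Fake pre-extracted image/text features for a batch of query-knowledge pairs.
    q_img, q_txt = torch.randn(8, 512), torch.randn(8, 512)
    k_img, k_txt = torch.randn(8, 512), torch.randn(8, 512)

    q_emb = encoder(q_img, q_txt)
    k_emb = encoder(k_img, k_txt)
    print("contrastive loss:", contrastive_loss(q_emb, k_emb).item())

    # Retrieval + re-selection for one query against a small knowledge bank.
    top_idx = retrieve_top_k(q_emb[0], k_emb, k=3)
    passages = [f"passage {i}" for i in top_idx.tolist()]
    best = mllm_reselect("What species is shown?", passages,
                         score_fn=lambda q, p: len(p))  # trivial stand-in scorer
    print("re-selected:", best)

The symmetric InfoNCE loss encourages each query embedding to match only its paired knowledge embedding within a batch, which is a standard way to train query-knowledge alignment; a real system would replace the random features and the trivial scorer with actual image/text encoders and an MLLM prompt.
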
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-seeing">
<titleInfo>
<title>Seeing Beyond: Enhancing Visual Question Answering with Multi-Modal Retrieval</title>
</titleInfo>
<name type="personal">
<namePart type="given">Boqi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anuj</namePart>
<namePart type="family">Khare</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaurav</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arjun</namePart>
<namePart type="family">Akula</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pradyumna</namePart>
<namePart type="family">Narayana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apoorv</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Multi-modal large language models (MLLMs) have made significant strides in complex content understanding and reasoning. However, they still suffer from model hallucination and lack of specific knowledge when facing challenging questions. To address these limitations, retrieval-augmented generation (RAG) has emerged as an effective solution. While incorporating knowledge has led to improvements, it also highlights the need for a more robust knowledge selection strategy. For multi-modal tasks, such as visual question answering (VQA), integrating all modalities is crucial in providing comprehensive information for accurate answers. Therefore, we propose to construct an encoder model for extracting joint embedding from all modalities, enabling alignment between the corresponding query and knowledge through contrastive learning. To further improve performance, we introduce an additional MLLM re-selection step, which selects the best matching knowledge from the top-k retrieved results of our alignment model. We evaluated our method, SeBe-VQA, on the Encyclopedic VQA dataset. Our knowledge retrieval results demonstrate the benefit of our multi-modal framework. By incorporating the retrieved knowledge along with the question, we achieve a significant performance improvement compared with the previous method and scenarios without knowledge provision.</abstract>
<identifier type="citekey">chen-etal-2025-seeing</identifier>
<location>
<url>https://aclanthology.org/2025.coling-industry.35/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>410</start>
<end>421</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Seeing Beyond: Enhancing Visual Question Answering with Multi-Modal Retrieval
%A Chen, Boqi
%A Khare, Anuj
%A Kumar, Gaurav
%A Akula, Arjun
%A Narayana, Pradyumna
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%Y Darwish, Kareem
%Y Agarwal, Apoorv
%S Proceedings of the 31st International Conference on Computational Linguistics: Industry Track
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F chen-etal-2025-seeing
%X Multi-modal large language models (MLLMs) have made significant strides in complex content understanding and reasoning. However, they still suffer from model hallucination and lack of specific knowledge when facing challenging questions. To address these limitations, retrieval-augmented generation (RAG) has emerged as an effective solution. While incorporating knowledge has led to improvements, it also highlights the need for a more robust knowledge selection strategy. For multi-modal tasks, such as visual question answering (VQA), integrating all modalities is crucial in providing comprehensive information for accurate answers. Therefore, we propose to construct an encoder model for extracting joint embedding from all modalities, enabling alignment between the corresponding query and knowledge through contrastive learning. To further improve performance, we introduce an additional MLLM re-selection step, which selects the best matching knowledge from the top-k retrieved results of our alignment model. We evaluated our method, SeBe-VQA, on the Encyclopedic VQA dataset. Our knowledge retrieval results demonstrate the benefit of our multi-modal framework. By incorporating the retrieved knowledge along with the question, we achieve a significant performance improvement compared with the previous method and scenarios without knowledge provision.
%U https://aclanthology.org/2025.coling-industry.35/
%P 410-421
Markdown (Informal)
[Seeing Beyond: Enhancing Visual Question Answering with Multi-Modal Retrieval](https://aclanthology.org/2025.coling-industry.35/) (Chen et al., COLING 2025)
ACL
Boqi Chen, Anuj Khare, Gaurav Kumar, Arjun Akula, and Pradyumna Narayana. 2025. [Seeing Beyond: Enhancing Visual Question Answering with Multi-Modal Retrieval](https://aclanthology.org/2025.coling-industry.35/). In *Proceedings of the 31st International Conference on Computational Linguistics: Industry Track*, pages 410–421, Abu Dhabi, UAE. Association for Computational Linguistics.