@inproceedings{deng-etal-2025-muka,
title = "{M}u{KA}: Multimodal Knowledge Augmented Visual Information-Seeking",
author = "Deng, Lianghao and
Sun, Yuchong and
Chen, Shizhe and
Yang, Ning and
Wang, Yunfeng and
Song, Ruihua",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.647/",
pages = "9675--9686",
abstract = "The visual information-seeking task aims to answer visual questions that require external knowledge, such as {\textquotedblleft}On what date did this building officially open?{\textquotedblright}. Existing methods using retrieval-augmented generation framework primarily rely on textual knowledge bases to assist multimodal large language models (MLLMs) in answering questions. However, the text-only knowledge can impair information retrieval for the multimodal query of image and question, and also confuse MLLMs in selecting the most relevant information during generation. In this work, we propose a novel framework MuKA which leverages a multimodal knowledge base to address these limitations. Specifically, we construct a multimodal knowledge base by automatically pairing images with text passages in existing datasets. We then design a fine-grained multimodal interaction to effectively retrieve multimodal documents and enrich MLLMs with both retrieved texts and images. MuKA outperforms state-of-the-art methods by 38.7{\%} and 15.9{\%} on the InfoSeek and E-VQA benchmark respectively, demonstrating the importance of multimodal knowledge in enhancing both retrieval and answer generation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deng-etal-2025-muka">
<titleInfo>
<title>MuKA: Multimodal Knowledge Augmented Visual Information-Seeking</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lianghao</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuchong</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shizhe</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ning</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunfeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruihua</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The visual information-seeking task aims to answer visual questions that require external knowledge, such as “On what date did this building officially open?”. Existing methods using the retrieval-augmented generation framework primarily rely on textual knowledge bases to assist multimodal large language models (MLLMs) in answering questions. However, text-only knowledge can impair information retrieval for the multimodal query composed of an image and a question, and can also confuse MLLMs in selecting the most relevant information during generation. In this work, we propose a novel framework, MuKA, which leverages a multimodal knowledge base to address these limitations. Specifically, we construct a multimodal knowledge base by automatically pairing images with text passages in existing datasets. We then design a fine-grained multimodal interaction to effectively retrieve multimodal documents and enrich MLLMs with both retrieved texts and images. MuKA outperforms state-of-the-art methods by 38.7% and 15.9% on the InfoSeek and E-VQA benchmarks, respectively, demonstrating the importance of multimodal knowledge in enhancing both retrieval and answer generation.</abstract>
<identifier type="citekey">deng-etal-2025-muka</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.647/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>9675</start>
<end>9686</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MuKA: Multimodal Knowledge Augmented Visual Information-Seeking
%A Deng, Lianghao
%A Sun, Yuchong
%A Chen, Shizhe
%A Yang, Ning
%A Wang, Yunfeng
%A Song, Ruihua
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F deng-etal-2025-muka
%X The visual information-seeking task aims to answer visual questions that require external knowledge, such as “On what date did this building officially open?”. Existing methods using the retrieval-augmented generation framework primarily rely on textual knowledge bases to assist multimodal large language models (MLLMs) in answering questions. However, text-only knowledge can impair information retrieval for the multimodal query composed of an image and a question, and can also confuse MLLMs in selecting the most relevant information during generation. In this work, we propose a novel framework, MuKA, which leverages a multimodal knowledge base to address these limitations. Specifically, we construct a multimodal knowledge base by automatically pairing images with text passages in existing datasets. We then design a fine-grained multimodal interaction to effectively retrieve multimodal documents and enrich MLLMs with both retrieved texts and images. MuKA outperforms state-of-the-art methods by 38.7% and 15.9% on the InfoSeek and E-VQA benchmarks, respectively, demonstrating the importance of multimodal knowledge in enhancing both retrieval and answer generation.
%U https://aclanthology.org/2025.coling-main.647/
%P 9675-9686
Markdown (Informal)
[MuKA: Multimodal Knowledge Augmented Visual Information-Seeking](https://aclanthology.org/2025.coling-main.647/) (Deng et al., COLING 2025)
ACL
Lianghao Deng, Yuchong Sun, Shizhe Chen, Ning Yang, Yunfeng Wang, and Ruihua Song. 2025. MuKA: Multimodal Knowledge Augmented Visual Information-Seeking. In Proceedings of the 31st International Conference on Computational Linguistics, pages 9675–9686, Abu Dhabi, UAE. Association for Computational Linguistics.