@inproceedings{zhu-etal-2025-murar,
title = "{M}u{RAR}: A Simple and Effective Multimodal Retrieval and Answer Refinement Framework for Multimodal Question Answering",
author = "Zhu, Zhengyuan and
Lee, Daniel and
Zhang, Hong and
Sree Harsha, Sai and
Feujio, Loic and
Maharaj, Akash and
Li, Yunyao",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven and
Mather, Brodie and
Dras, Mark",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: System Demonstrations",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-demos.13/",
pages = "126--135",
abstract = "Recent advancements in retrieval-augmented generation have demonstrated impressive performance on the question-answering task. However, most previous work predominantly focuses on text-based answers. Although some studies have explored multimodal data, they still fall short in generating comprehensive multimodal answers, especially step-by-step tutorials for accomplishing specific goals. This capability is especially valuable in application scenarios such as enterprise chatbots, customer service systems, and educational platforms. In this paper, we propose a simple and effective framework, MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR starts by generating an initial text answer based on the user`s question. It then retrieves multimodal data relevant to the snippets of the initial text answer. By leveraging the retrieved multimodal data and contextual features, MuRAR refines the initial text answer to create a more comprehensive and informative response. This highly adaptable framework can be easily integrated into an enterprise chatbot to produce multimodal answers with minimal modifications. Human evaluations demonstrate that the multimodal answers generated by MuRAR are significantly more useful and readable than plain text responses. A video demo of MuRAR is available at https://youtu.be/ykGRtyVVQpU."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhu-etal-2025-murar">
<titleInfo>
<title>MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement Framework for Multimodal Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhengyuan</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sai</namePart>
<namePart type="family">Sree Harsha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Loic</namePart>
<namePart type="family">Feujio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akash</namePart>
<namePart type="family">Maharaj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brodie</namePart>
<namePart type="family">Mather</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Dras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent advancements in retrieval-augmented generation have demonstrated impressive performance on the question-answering task. However, most previous work predominantly focuses on text-based answers. Although some studies have explored multimodal data, they still fall short in generating comprehensive multimodal answers, especially step-by-step tutorials for accomplishing specific goals. This capability is especially valuable in application scenarios such as enterprise chatbots, customer service systems, and educational platforms. In this paper, we propose a simple and effective framework, MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR starts by generating an initial text answer based on the user's question. It then retrieves multimodal data relevant to the snippets of the initial text answer. By leveraging the retrieved multimodal data and contextual features, MuRAR refines the initial text answer to create a more comprehensive and informative response. This highly adaptable framework can be easily integrated into an enterprise chatbot to produce multimodal answers with minimal modifications. Human evaluations demonstrate that the multimodal answers generated by MuRAR are significantly more useful and readable than plain text responses. A video demo of MuRAR is available at https://youtu.be/ykGRtyVVQpU.</abstract>
<identifier type="citekey">zhu-etal-2025-murar</identifier>
<location>
<url>https://aclanthology.org/2025.coling-demos.13/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>126</start>
<end>135</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement Framework for Multimodal Question Answering
%A Zhu, Zhengyuan
%A Lee, Daniel
%A Zhang, Hong
%A Sree Harsha, Sai
%A Feujio, Loic
%A Maharaj, Akash
%A Li, Yunyao
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%Y Mather, Brodie
%Y Dras, Mark
%S Proceedings of the 31st International Conference on Computational Linguistics: System Demonstrations
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F zhu-etal-2025-murar
%X Recent advancements in retrieval-augmented generation have demonstrated impressive performance on the question-answering task. However, most previous work predominantly focuses on text-based answers. Although some studies have explored multimodal data, they still fall short in generating comprehensive multimodal answers, especially step-by-step tutorials for accomplishing specific goals. This capability is especially valuable in application scenarios such as enterprise chatbots, customer service systems, and educational platforms. In this paper, we propose a simple and effective framework, MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR starts by generating an initial text answer based on the user's question. It then retrieves multimodal data relevant to the snippets of the initial text answer. By leveraging the retrieved multimodal data and contextual features, MuRAR refines the initial text answer to create a more comprehensive and informative response. This highly adaptable framework can be easily integrated into an enterprise chatbot to produce multimodal answers with minimal modifications. Human evaluations demonstrate that the multimodal answers generated by MuRAR are significantly more useful and readable than plain text responses. A video demo of MuRAR is available at https://youtu.be/ykGRtyVVQpU.
%U https://aclanthology.org/2025.coling-demos.13/
%P 126-135
Markdown (Informal)
[MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement Framework for Multimodal Question Answering](https://aclanthology.org/2025.coling-demos.13/) (Zhu et al., COLING 2025)
ACL
Zhengyuan Zhu, Daniel Lee, Hong Zhang, Sai Sree Harsha, Loic Feujio, Akash Maharaj, and Yunyao Li. 2025. MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement Framework for Multimodal Question Answering. In Proceedings of the 31st International Conference on Computational Linguistics: System Demonstrations, pages 126–135, Abu Dhabi, UAE. Association for Computational Linguistics.
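
The abstract describes a three-stage pipeline: generate an initial text answer, retrieve multimodal data relevant to snippets of that answer, then refine the answer using the retrieved items. The following is a minimal Python sketch of that flow as the abstract states it, not the authors' implementation; all names here (`generate_initial_answer`, the `llm` callable, the `index.search` interface, `top_k=3`) are hypothetical illustrations.

```python
# Hypothetical sketch of the pipeline described in the MuRAR abstract.
# The `llm` callable and `index.search` interface are assumed, not the
# authors' published API.
from dataclasses import dataclass
from typing import Callable, List

@dataclass
class MultimodalItem:
    modality: str   # e.g. "image", "video", "table"
    content: str    # URI or serialized content
    score: float    # retrieval relevance score

def generate_initial_answer(question: str, llm: Callable[[str], str]) -> str:
    # Stage 1: produce a plain text answer (e.g. via standard RAG).
    return llm(f"Answer the following question:\n{question}")

def split_into_snippets(answer: str) -> List[str]:
    # Segment the draft answer; sentence-level splitting is one simple choice.
    return [s.strip() for s in answer.split(".") if s.strip()]

def retrieve_multimodal(snippet: str, index) -> List[MultimodalItem]:
    # Stage 2: query a multimodal index with each answer snippet.
    return index.search(snippet, top_k=3)

def refine_answer(question: str, answer: str,
                  retrieved: List[List[MultimodalItem]],
                  llm: Callable[[str], str]) -> str:
    # Stage 3: ask the model to weave relevant retrieved items into the
    # draft answer so each step can carry its supporting image/video/table.
    context = "\n".join(
        f"[{item.modality}] {item.content} (score={item.score:.2f})"
        for items in retrieved for item in items
    )
    prompt = (
        "Refine the answer below by inserting the multimodal items where "
        "they support a step, keeping the text coherent.\n"
        f"Question: {question}\nDraft answer: {answer}\n"
        f"Retrieved items:\n{context}"
    )
    return llm(prompt)

def murar_style_pipeline(question: str, llm, index) -> str:
    answer = generate_initial_answer(question, llm)
    snippets = split_into_snippets(answer)
    retrieved = [retrieve_multimodal(s, index) for s in snippets]
    return refine_answer(question, answer, retrieved, llm)
```

Retrieving per answer snippet, rather than once per question, is what lets each step of a tutorial-style answer attract its own supporting image or video, which is the capability the abstract emphasizes.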