@inproceedings{chowdhury-etal-2025-towards,
    title = "Towards Multilingual spoken Visual Question Answering system using Cross-Attention",
    author = "Chowdhury, Amartya Roy and
      Rajkhowa, Tonmoy and
      Sharma, Sanjeev",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Eugenio, Barbara Di and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.615/",
    pages = "9165--9175",
    abstract = "Visual question answering (VQA) poses a multi-modal translation challenge that requires the analysis of both images and questions simultaneously to generate appropriate responses. Although VQA research has mainly focused on text-based questions in English, speech-based questions in English and other languages remain largely unexplored. Incorporating speech could significantly enhance the utility of VQA systems, as speech is the primary mode of human communication. To address this gap, this work implements a speech-based VQA system and introduces the textless multilingual visual question answering (TM-VQA) dataset, featuring speech-based questions in English, German, Spanish, and French. This TM-VQA dataset contains 658,111 pairs of speech-based questions and answers based on 123,287 images. Finally, a novel, cross-attention-based unified multi-modal framework is presented to evaluate the efficacy of the TM-VQA dataset. The experimental results indicate the effectiveness of the proposed unified approach over the cascaded framework for both text- and speech-based VQA systems. The dataset can be accessed at https://github.com/Synaptic-Coder/TM-VQA."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chowdhury-etal-2025-towards">
    <titleInfo>
      <title>Towards Multilingual spoken Visual Question Answering system using Cross-Attention</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Amartya</namePart>
      <namePart type="given">Roy</namePart>
      <namePart type="family">Chowdhury</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tonmoy</namePart>
      <namePart type="family">Rajkhowa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sanjeev</namePart>
      <namePart type="family">Sharma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Barbara</namePart>
        <namePart type="given">Di</namePart>
        <namePart type="family">Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Visual question answering (VQA) poses a multi-modal translation challenge that requires the analysis of both images and questions simultaneously to generate appropriate responses. Although VQA research has mainly focused on text-based questions in English, speech-based questions in English and other languages remain largely unexplored. Incorporating speech could significantly enhance the utility of VQA systems, as speech is the primary mode of human communication. To address this gap, this work implements a speech-based VQA system and introduces the textless multilingual visual question answering (TM-VQA) dataset, featuring speech-based questions in English, German, Spanish, and French. This TM-VQA dataset contains 658,111 pairs of speech-based questions and answers based on 123,287 images. Finally, a novel, cross-attention-based unified multi-modal framework is presented to evaluate the efficacy of the TM-VQA dataset. The experimental results indicate the effectiveness of the proposed unified approach over the cascaded framework for both text- and speech-based VQA systems. The dataset can be accessed at https://github.com/Synaptic-Coder/TM-VQA.</abstract>
    <identifier type="citekey">chowdhury-etal-2025-towards</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.615/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>9165</start>
        <end>9175</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Multilingual spoken Visual Question Answering system using Cross-Attention
%A Chowdhury, Amartya Roy
%A Rajkhowa, Tonmoy
%A Sharma, Sanjeev
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F chowdhury-etal-2025-towards
%X Visual question answering (VQA) poses a multi-modal translation challenge that requires the analysis of both images and questions simultaneously to generate appropriate responses. Although VQA research has mainly focused on text-based questions in English, speech-based questions in English and other languages remain largely unexplored. Incorporating speech could significantly enhance the utility of VQA systems, as speech is the primary mode of human communication. To address this gap, this work implements a speech-based VQA system and introduces the textless multilingual visual question answering (TM-VQA) dataset, featuring speech-based questions in English, German, Spanish, and French. This TM-VQA dataset contains 658,111 pairs of speech-based questions and answers based on 123,287 images. Finally, a novel, cross-attention-based unified multi-modal framework is presented to evaluate the efficacy of the TM-VQA dataset. The experimental results indicate the effectiveness of the proposed unified approach over the cascaded framework for both text- and speech-based VQA systems. The dataset can be accessed at https://github.com/Synaptic-Coder/TM-VQA.
%U https://aclanthology.org/2025.coling-main.615/
%P 9165-9175
Markdown (Informal)
[Towards Multilingual spoken Visual Question Answering system using Cross-Attention](https://aclanthology.org/2025.coling-main.615/) (Chowdhury et al., COLING 2025)
ACL
Amartya Roy Chowdhury, Tonmoy Rajkhowa, and Sanjeev Sharma. 2025. Towards Multilingual spoken Visual Question Answering system using Cross-Attention. In Proceedings of the 31st International Conference on Computational Linguistics, pages 9165–9175, Abu Dhabi, UAE. Association for Computational Linguistics.
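
For readers curious about the cross-attention fusion the abstract mentions, below is a minimal, hypothetical sketch of attending from speech-question features to image-patch features. The feature dimensions, mean pooling, answer-vocabulary size, and linear classifier head are illustrative assumptions for the sketch, not the authors' published architecture.

```python
# Minimal sketch (assumptions noted below), not the paper's implementation:
# speech-question features act as queries over image-patch features, and the
# fused representation is pooled and classified over a fixed answer vocabulary.
import torch
import torch.nn as nn

class CrossAttentionVQA(nn.Module):
    def __init__(self, dim=512, heads=8, num_answers=3129):  # sizes are assumed
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)
        self.classifier = nn.Linear(dim, num_answers)

    def forward(self, speech_feats, image_feats):
        # speech_feats: (batch, speech_len, dim), e.g. from a speech encoder
        # image_feats:  (batch, num_patches, dim), e.g. from a vision encoder
        fused, _ = self.cross_attn(query=speech_feats,
                                   key=image_feats,
                                   value=image_feats)
        fused = self.norm(fused + speech_feats)  # residual connection
        pooled = fused.mean(dim=1)               # mean-pool over time steps
        return self.classifier(pooled)           # answer logits

# Toy usage with random tensors standing in for encoder outputs.
model = CrossAttentionVQA()
speech = torch.randn(2, 120, 512)  # 2 utterances, 120 frames
image = torch.randn(2, 196, 512)   # 2 images, 14x14 patches
logits = model(speech, image)      # shape: (2, 3129)
```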