@inproceedings{kumar-etal-2025-vision,
title = "Does Vision Still Help? Multimodal Translation with {CLIP}-Based Image Selection",
author = "Kumar, Deepak and
Gain, Baban and
Singh, Kshetrimayum Boynao and
Ekbal, Asif",
editor = "Nakazawa, Toshiaki and
Goto, Isao",
booktitle = "Proceedings of the Twelfth Workshop on Asian Translation (WAT 2025)",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.wat-1.12/",
pages = "115--123",
ISBN = "979-8-89176-309-8",
abstract = "Multimodal Machine Translation aims to enhance conventional text-only translation systems by incorporating visual context, typically in the form of images paired with captions. In this work, we present our submission to the WAT 2025 Multimodal Translation Shared Task, which explores the role of visual information in translating English captions into four Indic languages: Hindi, Bengali, Malayalam, and Odia. Our system builds upon the strong multilingual text translation backbone IndicTrans, augmented with a CLIP-based selective visual grounding mechanism. Specifically, we compute cosine similarities between text and image embeddings (both full and cropped regions) and automatically select the most semantically aligned image representation to integrate into the translation model. We observe that overall contribution of visual features is questionable. Our findings reaffirm recent evidence that large multilingual translation models can perform competitively without explicit visual grounding."
}
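
The CLIP-based image selection described in the abstract can be illustrated with a minimal, hypothetical sketch (not the authors' released code). It assumes the Hugging Face `transformers` CLIP checkpoint `openai/clip-vit-base-patch32` and a candidate list containing the full image plus cropped regions; each candidate is scored by cosine similarity to the English caption, and the best-aligned image embedding is returned for integration into the translation model.

```python
# Hypothetical sketch of CLIP-based image selection, as described in the abstract.
# Assumes: Hugging Face transformers CLIP; candidates = [full image] + cropped regions.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def select_visual_feature(caption: str, candidates: list[Image.Image]):
    """Pick the candidate image (full image or crop) whose CLIP embedding best matches the caption."""
    inputs = processor(text=[caption], images=candidates, return_tensors="pt", padding=True)
    with torch.no_grad():
        out = model(**inputs)
    # L2-normalise the projected embeddings so dot products are cosine similarities.
    text_emb = out.text_embeds / out.text_embeds.norm(dim=-1, keepdim=True)
    img_embs = out.image_embeds / out.image_embeds.norm(dim=-1, keepdim=True)
    sims = (img_embs @ text_emb.T).squeeze(-1)   # one similarity score per candidate
    best = int(sims.argmax())
    return img_embs[best], float(sims[best])     # embedding passed to the MT model, plus its score
```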