@inproceedings{zhang-etal-2025-vqa,
title = "{VQA}-Augmented Machine Translation with Cross-Modal Contrastive Learning",
author = "Zhang, Zhihui and
Sun, Shiliang and
Zhao, Jing and
Song, Tengfei and
Yang, Hao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.536/",
pages = "10113--10124",
ISBN = "979-8-89176-335-7",
abstract = "Multimodal machine translation (MMT) aims to enhance translation quality by integrating visual information. However, existing methods often extract visual features using pre-trained models while learning text features from scratch, leading to representation imbalance. These methods are also prone to being misled by redundant visual information, which results in suboptimal performance. To address these challenges, we propose CAMT, a novel cross-modal VQA-augmented MMT method. CAMT aligns image-source text pairs and image-question text pairs through dual-text contrastive learning, thereby improving semantic consistency across modalities. Additionally, we design an effective strategy for generating question{--}answer pairs to enhance fine-grained alignment and filter out irrelevant visual noise, while also addressing the scarcity of VQA annotations. Extensive experiments on multiple benchmark datasets demonstrate the effectiveness of the proposed CAMT framework, which consistently outperforms state-of-the-art MMT methods across multiple evaluation metrics."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2025-vqa">
<titleInfo>
<title>VQA-Augmented Machine Translation with Cross-Modal Contrastive Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhihui</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiliang</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tengfei</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Multimodal machine translation (MMT) aims to enhance translation quality by integrating visual information. However, existing methods often extract visual features using pre-trained models while learning text features from scratch, leading to representation imbalance. These methods are also prone to being misled by redundant visual information, which results in suboptimal performance. To address these challenges, we propose CAMT, a novel cross-modal VQA-augmented MMT method. CAMT aligns image-source text pairs and image-question text pairs through dual-text contrastive learning, thereby improving semantic consistency across modalities. Additionally, we design an effective strategy for generating question–answer pairs to enhance fine-grained alignment and filter out irrelevant visual noise, while also addressing the scarcity of VQA annotations. Extensive experiments on multiple benchmark datasets demonstrate the effectiveness of the proposed CAMT framework, which consistently outperforms state-of-the-art MMT methods across multiple evaluation metrics.</abstract>
<identifier type="citekey">zhang-etal-2025-vqa</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.536/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>10113</start>
<end>10124</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VQA-Augmented Machine Translation with Cross-Modal Contrastive Learning
%A Zhang, Zhihui
%A Sun, Shiliang
%A Zhao, Jing
%A Song, Tengfei
%A Yang, Hao
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F zhang-etal-2025-vqa
%X Multimodal machine translation (MMT) aims to enhance translation quality by integrating visual information. However, existing methods often extract visual features using pre-trained models while learning text features from scratch, leading to representation imbalance. These methods are also prone to being misled by redundant visual information, which results in suboptimal performance. To address these challenges, we propose CAMT, a novel cross-modal VQA-augmented MMT method. CAMT aligns image-source text pairs and image-question text pairs through dual-text contrastive learning, thereby improving semantic consistency across modalities. Additionally, we design an effective strategy for generating question–answer pairs to enhance fine-grained alignment and filter out irrelevant visual noise, while also addressing the scarcity of VQA annotations. Extensive experiments on multiple benchmark datasets demonstrate the effectiveness of the proposed CAMT framework, which consistently outperforms state-of-the-art MMT methods across multiple evaluation metrics.
%U https://aclanthology.org/2025.findings-emnlp.536/
%P 10113-10124
Markdown (Informal)
[VQA-Augmented Machine Translation with Cross-Modal Contrastive Learning](https://aclanthology.org/2025.findings-emnlp.536/) (Zhang et al., Findings 2025)
ACL
Zhihui Zhang, Shiliang Sun, Jing Zhao, Tengfei Song, and Hao Yang. 2025. VQA-Augmented Machine Translation with Cross-Modal Contrastive Learning. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 10113–10124, Suzhou, China. Association for Computational Linguistics.
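
As a rough illustration of the dual-text contrastive alignment described in the abstract (aligning image features with both source-text and question-text features), here is a minimal sketch of a symmetric InfoNCE-style objective. This is not the authors' implementation; the function names, the temperature, and the weighting hyperparameter `alpha` are assumptions made for illustration only.

```python
# Minimal sketch (not the paper's code) of a dual-text cross-modal
# contrastive loss: image embeddings are aligned with source-sentence
# embeddings and with generated-question embeddings via symmetric InfoNCE.
import torch
import torch.nn.functional as F


def info_nce(a: torch.Tensor, b: torch.Tensor, temperature: float = 0.07) -> torch.Tensor:
    """Symmetric InfoNCE between two batches of embeddings of shape (B, D)."""
    a = F.normalize(a, dim=-1)
    b = F.normalize(b, dim=-1)
    logits = a @ b.t() / temperature                      # (B, B) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)    # matching pairs on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))


def dual_text_contrastive_loss(img_emb: torch.Tensor,
                               src_emb: torch.Tensor,
                               qst_emb: torch.Tensor,
                               alpha: float = 0.5) -> torch.Tensor:
    """Combine image<->source-text and image<->question-text alignment terms.

    `alpha` is a hypothetical weighting hyperparameter, not taken from the paper.
    """
    return alpha * info_nce(img_emb, src_emb) + (1 - alpha) * info_nce(img_emb, qst_emb)


if __name__ == "__main__":
    B, D = 8, 512  # toy batch of projected image / source-text / question-text embeddings
    loss = dual_text_contrastive_loss(torch.randn(B, D), torch.randn(B, D), torch.randn(B, D))
    print(loss.item())
```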