@inproceedings{yu-etal-2025-imagination,
title = "Imagination and Contemplation: A Balanced Framework for Semantic-Augmented Multimodal Machine Translation",
author = "Yu, Zhuang and
Sun, Shiliang and
Zhao, Jing and
Song, Tengfei and
Yang, Hao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.579/",
doi = "10.18653/v1/2025.findings-emnlp.579",
pages = "10913--10928",
ISBN = "979-8-89176-335-7",
abstract = "Multimodal Machine Translation (MMT) enhances textual translation through auxiliary inputs such as images, which is particularly effective in resolving linguistic ambiguities. However, visual information often introduces redundancy or noise, potentially impairing translation quality. To address this challenge, we propose a balanced semantic-augmented framework that integrates ``Imagination{``} and ``Contemplation{``} in multimodal understanding. Specifically, we first generate synthetic images from the source text and align them with the authentic images via an optimal transport (OT) loss to enhance visual-semantic consistency. A CLIP-based similarity gating mechanism is introduced to adaptively fuse visual features from both authentic and synthetic images during visual representation learning. To strengthen semantic grounding, a neural machine translation (NMT) branch is incorporated as a regularization signal, and a Kullback-Leibler (KL) divergence is applied between MMT and NMT outputs to mitigate modality mismatch. Furthermore, an image-text contrastive (ITC) loss aligns the final translations with image representations, reinforcing multimodal coherence. Experiments on multiple translation datasets with a diverse set of language pairs demonstrate that our framework outperforms existing baselines, particularly in cases with visually ambiguous or weakly correlated content."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2025-imagination">
<titleInfo>
<title>Imagination and Contemplation: A Balanced Framework for Semantic-Augmented Multimodal Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhuang</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiliang</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tengfei</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Multimodal Machine Translation (MMT) enhances textual translation through auxiliary inputs such as images, which is particularly effective in resolving linguistic ambiguities. However, visual information often introduces redundancy or noise, potentially impairing translation quality. To address this challenge, we propose a balanced semantic-augmented framework that integrates “Imagination“ and “Contemplation“ in multimodal understanding. Specifically, we first generate synthetic images from the source text and align them with the authentic images via an optimal transport (OT) loss to enhance visual-semantic consistency. A CLIP-based similarity gating mechanism is introduced to adaptively fuse visual features from both authentic and synthetic images during visual representation learning. To strengthen semantic grounding, a neural machine translation (NMT) branch is incorporated as a regularization signal, and a Kullback-Leibler (KL) divergence is applied between MMT and NMT outputs to mitigate modality mismatch. Furthermore, an image-text contrastive (ITC) loss aligns the final translations with image representations, reinforcing multimodal coherence. Experiments on multiple translation datasets with a diverse set of language pairs demonstrate that our framework outperforms existing baselines, particularly in cases with visually ambiguous or weakly correlated content.</abstract>
<identifier type="citekey">yu-etal-2025-imagination</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.579</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.579/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>10913</start>
<end>10928</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Imagination and Contemplation: A Balanced Framework for Semantic-Augmented Multimodal Machine Translation
%A Yu, Zhuang
%A Sun, Shiliang
%A Zhao, Jing
%A Song, Tengfei
%A Yang, Hao
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F yu-etal-2025-imagination
%X Multimodal Machine Translation (MMT) enhances textual translation through auxiliary inputs such as images, which is particularly effective in resolving linguistic ambiguities. However, visual information often introduces redundancy or noise, potentially impairing translation quality. To address this challenge, we propose a balanced semantic-augmented framework that integrates “Imagination“ and “Contemplation“ in multimodal understanding. Specifically, we first generate synthetic images from the source text and align them with the authentic images via an optimal transport (OT) loss to enhance visual-semantic consistency. A CLIP-based similarity gating mechanism is introduced to adaptively fuse visual features from both authentic and synthetic images during visual representation learning. To strengthen semantic grounding, a neural machine translation (NMT) branch is incorporated as a regularization signal, and a Kullback-Leibler (KL) divergence is applied between MMT and NMT outputs to mitigate modality mismatch. Furthermore, an image-text contrastive (ITC) loss aligns the final translations with image representations, reinforcing multimodal coherence. Experiments on multiple translation datasets with a diverse set of language pairs demonstrate that our framework outperforms existing baselines, particularly in cases with visually ambiguous or weakly correlated content.
%R 10.18653/v1/2025.findings-emnlp.579
%U https://aclanthology.org/2025.findings-emnlp.579/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.579
%P 10913-10928
Markdown (Informal)
[Imagination and Contemplation: A Balanced Framework for Semantic-Augmented Multimodal Machine Translation](https://aclanthology.org/2025.findings-emnlp.579/) (Yu et al., Findings 2025)
ACL