@inproceedings{chen-etal-2025-make,
title = "Make Imagination Clearer! Stable Diffusion-based Visual Imagination for Multimodal Machine Translation",
author = "Chen, Andong and
Song, Yuchen and
Chen, Kehai and
Bai, Xuefeng and
Yang, Muyun and
Nie, Liqiang and
Liu, Jie and
Zhao, Tiejun and
Zhang, Min",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1289/",
doi = "10.18653/v1/2025.acl-long.1289",
pages = "26567--26583",
ISBN = "979-8-89176-251-0",
abstract = "Visual information has been introduced for enhancing machine translation (MT), and its effectiveness heavily relies on the availability of large amounts of bilingual parallel sentence pairs with manual image annotations. In this paper, we introduce a stable diffusion-based imagination network into a multimodal large language model (MLLM) to explicitly generate an image for each source sentence, thereby advancing the multimodel MT. Particularly, we build heuristic feedback with reinforcement learning to ensure the consistency of the generated image with the source sentence without the supervision of visual information, which breaks the high-cost bottleneck of image annotation in MT. Furthermore, the proposed method enables imaginative visual information to be integrated into text-only MT in addition to multimodal MT. Experimental results show that our model significantly outperforms existing multimodal MT and text-only MT, especially achieving an average improvement of more than 14 BLEU points on Multi30K and MSCOCO multimodal MT benchmarks."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-make">
<titleInfo>
<title>Make Imagination Clearer! Stable Diffusion-based Visual Imagination for Multimodal Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuchen</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kehai</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuefeng</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muyun</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liqiang</namePart>
<namePart type="family">Nie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jie</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiejun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Visual information has been introduced for enhancing machine translation (MT), and its effectiveness heavily relies on the availability of large amounts of bilingual parallel sentence pairs with manual image annotations. In this paper, we introduce a stable diffusion-based imagination network into a multimodal large language model (MLLM) to explicitly generate an image for each source sentence, thereby advancing multimodal MT. Particularly, we build heuristic feedback with reinforcement learning to ensure the consistency of the generated image with the source sentence without the supervision of visual information, which breaks the high-cost bottleneck of image annotation in MT. Furthermore, the proposed method enables imaginative visual information to be integrated into text-only MT in addition to multimodal MT. Experimental results show that our model significantly outperforms existing multimodal MT and text-only MT, especially achieving an average improvement of more than 14 BLEU points on Multi30K and MSCOCO multimodal MT benchmarks.</abstract>
<identifier type="citekey">chen-etal-2025-make</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1289</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1289/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>26567</start>
<end>26583</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Make Imagination Clearer! Stable Diffusion-based Visual Imagination for Multimodal Machine Translation
%A Chen, Andong
%A Song, Yuchen
%A Chen, Kehai
%A Bai, Xuefeng
%A Yang, Muyun
%A Nie, Liqiang
%A Liu, Jie
%A Zhao, Tiejun
%A Zhang, Min
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F chen-etal-2025-make
%X Visual information has been introduced for enhancing machine translation (MT), and its effectiveness heavily relies on the availability of large amounts of bilingual parallel sentence pairs with manual image annotations. In this paper, we introduce a stable diffusion-based imagination network into a multimodal large language model (MLLM) to explicitly generate an image for each source sentence, thereby advancing multimodal MT. Particularly, we build heuristic feedback with reinforcement learning to ensure the consistency of the generated image with the source sentence without the supervision of visual information, which breaks the high-cost bottleneck of image annotation in MT. Furthermore, the proposed method enables imaginative visual information to be integrated into text-only MT in addition to multimodal MT. Experimental results show that our model significantly outperforms existing multimodal MT and text-only MT, especially achieving an average improvement of more than 14 BLEU points on Multi30K and MSCOCO multimodal MT benchmarks.
%R 10.18653/v1/2025.acl-long.1289
%U https://aclanthology.org/2025.acl-long.1289/
%U https://doi.org/10.18653/v1/2025.acl-long.1289
%P 26567-26583
Markdown (Informal)
[Make Imagination Clearer! Stable Diffusion-based Visual Imagination for Multimodal Machine Translation](https://aclanthology.org/2025.acl-long.1289/) (Chen et al., ACL 2025)
ACL
Andong Chen, Yuchen Song, Kehai Chen, Xuefeng Bai, Muyun Yang, Liqiang Nie, Jie Liu, Tiejun Zhao, and Min Zhang. 2025. Make Imagination Clearer! Stable Diffusion-based Visual Imagination for Multimodal Machine Translation. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 26567–26583, Vienna, Austria. Association for Computational Linguistics.
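
The abstract describes a pipeline that "imagines" an image for each source sentence with Stable Diffusion, scores the consistency of that image with the sentence (as a reward signal in place of annotated images), and feeds the result to a multimodal translator. The snippet below is only a minimal, illustrative sketch of that idea, not the authors' implementation: the model checkpoints, the use of CLIP similarity as the consistency signal, and the `translate_with_mllm` stub are all assumptions introduced here for illustration.

```python
# Minimal sketch of the imagine -> score -> translate loop described in the abstract.
# Assumptions (not from the paper): Stable Diffusion v1.5 as the imagination model,
# CLIP text-image similarity as a stand-in for the paper's "heuristic feedback",
# and a placeholder for the multimodal translator.
import torch
from diffusers import StableDiffusionPipeline
from transformers import CLIPModel, CLIPProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"

sd = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(device)
clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def imagine(sentence: str):
    """Generate one image conditioned on the source sentence."""
    return sd(sentence, num_inference_steps=30).images[0]


def consistency_reward(sentence: str, image) -> float:
    """CLIP text-image similarity, used here as an illustrative reward signal."""
    inputs = clip_proc(text=[sentence], images=image,
                       return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        out = clip(**inputs)
    return out.logits_per_image.item()


def translate_with_mllm(sentence: str, image) -> str:
    """Hypothetical hook for a multimodal LLM that consumes sentence + imagined image."""
    raise NotImplementedError("plug in a multimodal translation model here")


if __name__ == "__main__":
    src = "A man in a red shirt is riding a bicycle down the street."
    img = imagine(src)
    print("consistency reward:", consistency_reward(src, img))
```

In the paper's setting, a reward of this kind would drive reinforcement-learning updates to the imagination network so that generated images stay faithful to the source sentence without requiring manually annotated images; the sketch above only shows where such a signal could be computed.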