@inproceedings{liang-etal-2025-improving,
title = "Improving {MLLM}{'}s Document Image Machine Translation via Synchronously Self-reviewing Its {OCR} Proficiency",
author = "Liang, Yupu and
Zhang, Yaping and
Zhang, Zhiyang and
Chen, Zhiyuan and
Zhao, Yang and
Xiang, Lu and
Zong, Chengqing and
Zhou, Yu",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1213/",
doi = "10.18653/v1/2025.findings-acl.1213",
pages = "23659--23678",
ISBN = "979-8-89176-256-5",
abstract = "Multimodal Large Language Models (MLLMs) have shown strong performance in document image tasks, especially Optical Character Recognition (OCR). However, they struggle with Document Image Machine Translation (DIMT), which requires handling both cross-modal and cross-lingual challenges. Previous efforts to enhance DIMT capability through Supervised Fine-Tuning (SFT) on the DIMT dataset often result in the forgetting of the model{'}s existing monolingual abilities, such as OCR. To address these challenges, we introduce a novel fine-tuning paradigm, named Synchronously Self-Reviewing (SSR) its OCR proficiency, inspired by the concept ``Bilingual Cognitive Advantage''. Specifically, SSR prompts the model to generate OCR text before producing translation text, which allows the model to leverage its strong monolingual OCR ability while learning to translate text across languages. Comprehensive experiments demonstrate the proposed SSR learning helps mitigate catastrophic forgetting, improving the generalization ability of MLLMs on both OCR and DIMT tasks. The code will be released upon acceptance."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liang-etal-2025-improving">
<titleInfo>
<title>Improving MLLM’s Document Image Machine Translation via Synchronously Self-reviewing Its OCR Proficiency</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yupu</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaping</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiyuan</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Multimodal Large Language Models (MLLMs) have shown strong performance in document image tasks, especially Optical Character Recognition (OCR). However, they struggle with Document Image Machine Translation (DIMT), which requires handling both cross-modal and cross-lingual challenges. Previous efforts to enhance DIMT capability through Supervised Fine-Tuning (SFT) on the DIMT dataset often result in the forgetting of the model’s existing monolingual abilities, such as OCR. To address these challenges, we introduce a novel fine-tuning paradigm, named Synchronously Self-Reviewing (SSR) its OCR proficiency, inspired by the concept “Bilingual Cognitive Advantage”. Specifically, SSR prompts the model to generate OCR text before producing translation text, which allows the model to leverage its strong monolingual OCR ability while learning to translate text across languages. Comprehensive experiments demonstrate the proposed SSR learning helps mitigate catastrophic forgetting, improving the generalization ability of MLLMs on both OCR and DIMT tasks. The code will be released upon acceptance.</abstract>
<identifier type="citekey">liang-etal-2025-improving</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1213</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1213/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>23659</start>
<end>23678</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving MLLM’s Document Image Machine Translation via Synchronously Self-reviewing Its OCR Proficiency
%A Liang, Yupu
%A Zhang, Yaping
%A Zhang, Zhiyang
%A Chen, Zhiyuan
%A Zhao, Yang
%A Xiang, Lu
%A Zong, Chengqing
%A Zhou, Yu
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F liang-etal-2025-improving
%X Multimodal Large Language Models (MLLMs) have shown strong performance in document image tasks, especially Optical Character Recognition (OCR). However, they struggle with Document Image Machine Translation (DIMT), which requires handling both cross-modal and cross-lingual challenges. Previous efforts to enhance DIMT capability through Supervised Fine-Tuning (SFT) on the DIMT dataset often result in the forgetting of the model’s existing monolingual abilities, such as OCR. To address these challenges, we introduce a novel fine-tuning paradigm, named Synchronously Self-Reviewing (SSR) its OCR proficiency, inspired by the concept “Bilingual Cognitive Advantage”. Specifically, SSR prompts the model to generate OCR text before producing translation text, which allows the model to leverage its strong monolingual OCR ability while learning to translate text across languages. Comprehensive experiments demonstrate the proposed SSR learning helps mitigate catastrophic forgetting, improving the generalization ability of MLLMs on both OCR and DIMT tasks. The code will be released upon acceptance.
%R 10.18653/v1/2025.findings-acl.1213
%U https://aclanthology.org/2025.findings-acl.1213/
%U https://doi.org/10.18653/v1/2025.findings-acl.1213
%P 23659-23678
Markdown (Informal)
[Improving MLLM’s Document Image Machine Translation via Synchronously Self-reviewing Its OCR Proficiency](https://aclanthology.org/2025.findings-acl.1213/) (Liang et al., Findings 2025)
ACL
- Yupu Liang, Yaping Zhang, Zhiyang Zhang, Zhiyuan Chen, Yang Zhao, Lu Xiang, Chengqing Zong, and Yu Zhou. 2025. Improving MLLM’s Document Image Machine Translation via Synchronously Self-reviewing Its OCR Proficiency. In Findings of the Association for Computational Linguistics: ACL 2025, pages 23659–23678, Vienna, Austria. Association for Computational Linguistics.