% BibTeX record for the HisDoc-OCR paper (Findings of ACL 2026), as listed at
% the ACL Anthology URL given in the `url` field. Whole words are braced in
% `title`/`booktitle` so sentence-casing styles keep their capitalisation.
@inproceedings{cao-etal-2026-hisdoc,
  title     = {{HisDoc-OCR}: Restoring Visual Grounding in {MLLMs} for {Chinese} Historical Document {OCR}},
  author    = {Cao, Jiahuan and
               Shi, Yongxin and
               Shan, Zeyu and
               Lu, Zhengyang and
               Jin, Lianwen},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.301/},
  pages     = {6053--6066},
  isbn      = {979-8-89176-395-1},
  abstract  = {Chinese historical documents encode millennia of cultural heritage, yet remain largely inaccessible to computational analysis. While multimodal large language models (MLLMs) have achieved strong performance on modern document OCR, their application to historical Chinese texts suffers from severe hallucinations, character fabrication, uncontrolled repetition, and semantic drift. We identify the root cause as visual-textual misalignment: models prioritize linguistic priors over visual evidence, particularly problematic when archaic orthography and degraded image quality destabilize cross-modal correspondences. To address this, we propose HisDoc-OCR, which restores visual grounding through three synergistic strategies: (1) Layout Injection, which encodes two-dimensional layout structures into textual outputs using layout-aware delimiters; (2) First-Occurrence Boost, which emphasizes vision-dependent characters during training by reweighting first-occurrence characters; (3) Self-Distilled Attention Focusing, which guides the model's attention by distilling patterns from the most focused layer to the remaining layers. Extensive experiments demonstrate that HisDoc-OCR consistently outperforms general-purpose and OCR-specific MLLMs. The code will be publicly available.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cao-etal-2026-hisdoc">
<titleInfo>
<title>HisDoc-OCR: Restoring Visual Grounding in MLLMs for Chinese Historical Document OCR</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiahuan</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yongxin</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeyu</namePart>
<namePart type="family">Shan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengyang</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lianwen</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P.</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Chinese historical documents encode millennia of cultural heritage, yet remain largely inaccessible to computational analysis. While multimodal large language models (MLLMs) have achieved strong performance on modern document OCR, their application to historical Chinese texts suffers from severe hallucinations, character fabrication, uncontrolled repetition, and semantic drift. We identify the root cause as visual-textual misalignment: models prioritize linguistic priors over visual evidence, particularly problematic when archaic orthography and degraded image quality destabilize cross-modal correspondences. To address this, we propose HisDoc-OCR, which restores visual grounding through three synergistic strategies: (1) Layout Injection, which encodes two-dimensional layout structures into textual outputs using layout-aware delimiters; (2) First-Occurrence Boost, which emphasizes vision-dependent characters during training by reweighting first-occurrence characters; (3) Self-Distilled Attention Focusing, which guides the model’s attention by distilling patterns from the most focused layer to the remaining layers. Extensive experiments demonstrate that HisDoc-OCR consistently outperforms general-purpose and OCR-specific MLLMs. The code will be publicly available.</abstract>
<identifier type="citekey">cao-etal-2026-hisdoc</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.301/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>6053</start>
<end>6066</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HisDoc-OCR: Restoring Visual Grounding in MLLMs for Chinese Historical Document OCR
%A Cao, Jiahuan
%A Shi, Yongxin
%A Shan, Zeyu
%A Lu, Zhengyang
%A Jin, Lianwen
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F cao-etal-2026-hisdoc
%X Chinese historical documents encode millennia of cultural heritage, yet remain largely inaccessible to computational analysis. While multimodal large language models (MLLMs) have achieved strong performance on modern document OCR, their application to historical Chinese texts suffers from severe hallucinations, character fabrication, uncontrolled repetition, and semantic drift. We identify the root cause as visual-textual misalignment: models prioritize linguistic priors over visual evidence, particularly problematic when archaic orthography and degraded image quality destabilize cross-modal correspondences. To address this, we propose HisDoc-OCR, which restores visual grounding through three synergistic strategies: (1) Layout Injection, which encodes two-dimensional layout structures into textual outputs using layout-aware delimiters; (2) First-Occurrence Boost, which emphasizes vision-dependent characters during training by reweighting first-occurrence characters; (3) Self-Distilled Attention Focusing, which guides the model’s attention by distilling patterns from the most focused layer to the remaining layers. Extensive experiments demonstrate that HisDoc-OCR consistently outperforms general-purpose and OCR-specific MLLMs. The code will be publicly available.
%U https://aclanthology.org/2026.findings-acl.301/
%P 6053-6066
Markdown (Informal)
[HisDoc-OCR: Restoring Visual Grounding in MLLMs for Chinese Historical Document OCR](https://aclanthology.org/2026.findings-acl.301/) (Cao et al., Findings 2026)
ACL