@inproceedings{hwang-etal-2026-perceptual,
title = "Perceptual Hallucination in Vision{--}Language Models: Definition, Analysis and Verification",
author = "Hwang, Taewook and
Heo, Inbum and
Lee, Sung Jun and
Jung, Sangkeun",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1237/",
pages = "24710--24725",
ISBN = "979-8-89176-395-1",
abstract = "Vision-Language Models (VLMs) have demonstrated remarkable performance in document understanding tasks; however, VLMs also suffer from hallucinations inherited from LLMs. While prior work has focused on reasoning-stage hallucinations, the role of visual perception remains underexplored. In this work, we define perceptual hallucination as the phenomenon where VLMs generate information as if perceived, despite absent or damaged visual evidence. To analyze this, we construct DocHallu, a benchmark of 2,671 original{--}damaged image pairs across three tasks, available at https://huggingface.co/datasets/IB99/DocHallu. Experiments reveal that perceptual hallucination occurs across all models, with higher rates for numerical content than textual content. Activation patching analysis suggests that hallucinations are strongly associated with errors introduced in the vision encoder, which can subsequently propagate and become amplified through the text decoding process. We also demonstrate that LLM-based post-hoc filtering can reduce hallucination exposure by 36{\%} on average, with reductions of up to 88{\%}. This work extends VLM hallucination research by defining, analyzing, and verifying perceptual hallucination in document understanding."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hwang-etal-2026-perceptual">
<titleInfo>
<title>Perceptual Hallucination in Vision–Language Models: Definition, Analysis and Verification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Taewook</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Inbum</namePart>
<namePart type="family">Heo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sung</namePart>
<namePart type="given">Jun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sangkeun</namePart>
<namePart type="family">Jung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Vision-Language Models (VLMs) have demonstrated remarkable performance in document understanding tasks; however, VLMs also suffer from hallucinations inherited from LLMs. While prior work has focused on reasoning-stage hallucinations, the role of visual perception remains underexplored. In this work, we define perceptual hallucination as the phenomenon where VLMs generate information as if perceived, despite absent or damaged visual evidence. To analyze this, we construct DocHallu, a benchmark of 2,671 original–damaged image pairs across three tasks, available at https://huggingface.co/datasets/IB99/DocHallu. Experiments reveal that perceptual hallucination occurs across all models, with higher rates for numerical content than textual content. Activation patching analysis suggests that hallucinations are strongly associated with errors introduced in the vision encoder, which can subsequently propagate and become amplified through the text decoding process. We also demonstrate that LLM-based post-hoc filtering can reduce hallucination exposure by 36% on average, with reductions of up to 88%. This work extends VLM hallucination research by defining, analyzing, and verifying perceptual hallucination in document understanding.</abstract>
<identifier type="citekey">hwang-etal-2026-perceptual</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1237/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>24710</start>
<end>24725</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Perceptual Hallucination in Vision–Language Models: Definition, Analysis and Verification
%A Hwang, Taewook
%A Heo, Inbum
%A Lee, Sung Jun
%A Jung, Sangkeun
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F hwang-etal-2026-perceptual
%X Vision-Language Models (VLMs) have demonstrated remarkable performance in document understanding tasks; however, VLMs also suffer from hallucinations inherited from LLMs. While prior work has focused on reasoning-stage hallucinations, the role of visual perception remains underexplored. In this work, we define perceptual hallucination as the phenomenon where VLMs generate information as if perceived, despite absent or damaged visual evidence. To analyze this, we construct DocHallu, a benchmark of 2,671 original–damaged image pairs across three tasks, available at https://huggingface.co/datasets/IB99/DocHallu. Experiments reveal that perceptual hallucination occurs across all models, with higher rates for numerical content than textual content. Activation patching analysis suggests that hallucinations are strongly associated with errors introduced in the vision encoder, which can subsequently propagate and become amplified through the text decoding process. We also demonstrate that LLM-based post-hoc filtering can reduce hallucination exposure by 36% on average, with reductions of up to 88%. This work extends VLM hallucination research by defining, analyzing, and verifying perceptual hallucination in document understanding.
%U https://aclanthology.org/2026.findings-acl.1237/
%P 24710-24725
Markdown (Informal)
[Perceptual Hallucination in Vision–Language Models: Definition, Analysis and Verification](https://aclanthology.org/2026.findings-acl.1237/) (Hwang et al., Findings 2026)
ACL