% BibTeX entry for the CEBC paper (ACL 2026, Volume 1: Long Papers).
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{mishra-etal-2026-cebc,
    title = "{CEBC}: Conformal Evidence-Bounded Control for Low-Hallucination Vision{--}Language Generation",
    author = "Mishra, Ashish and
      Kumar, Tarun and
      Shah, Arpit and
      Bhattacharya, Suparna and
      Foltin, Martin",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.acl-long.2142/",
    doi = "10.18653/v1/2026.acl-long.2142",
    pages = "46193--46206",
    isbn = "979-8-89176-390-6",
    abstract = "Hallucinated object mentions remain a persistent failure mode of vision{--}language models (VLMs) across generation tasks such as image captioning and visual question answering: outputs may be fluent yet include entities not supported by visual evidence. Existing mitigation approaches often reduce hallucinations at the cost of degraded generation quality or require expensive retraining and task-specific supervision. We introduce CEBC, a lightweight, training-free framework for low-hallucination vision{--}language generation based on conformal evidence-bounded minimal editing. CEBC first produces a strong base output (via greedy decoding or best-of-K sampling), then applies an evidence-bounded editing step that minimally revises or suppresses unsupported object mentions using constraints derived from an external visual detector. Crucially, the evidence threshold is conformally calibrated on a small held-out set via quantiles of detector confidence scores, enabling explicit and controllable hallucination risk at test time. To balance factuality and informativeness, we further introduce a risk-first, quality-aware selection rule that prioritizes evidence-consistent generations while regularizing unnecessary length or lexical drift. Extensive experiments on MS-COCO and GQA for image captioning, and POPE for VQA evaluation across multiple VLMs demonstrate that CEBC consistently reduces hallucination rates (CHAIR{\_}S, CHAIR{\_}I, POPE) while maintaining or improving standard generation quality metrics (CIDEr, BLEU, CLIPScore). CEBC establishes a stronger factuality{--}quality Pareto frontier without any additional model training or access to paired supervision beyond an off-the-shelf detector."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for mishra-etal-2026-cebc. The nested relatedItem
       type="host" describes the proceedings volume (editors, publisher,
       place, ISBN); the part element carries the page range. -->
  <mods ID="mishra-etal-2026-cebc">
    <titleInfo>
      <title>CEBC: Conformal Evidence-Bounded Control for Low-Hallucination Vision–Language Generation</title>
    </titleInfo>
    <!-- Authors, in byline order -->
    <name type="personal">
      <namePart type="given">Ashish</namePart>
      <namePart type="family">Mishra</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tarun</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Arpit</namePart>
      <namePart type="family">Shah</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Suparna</namePart>
      <namePart type="family">Bhattacharya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Martin</namePart>
      <namePart type="family">Foltin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>Hallucinated object mentions remain a persistent failure mode of vision–language models (VLMs) across generation tasks such as image captioning and visual question answering: outputs may be fluent yet include entities not supported by visual evidence. Existing mitigation approaches often reduce hallucinations at the cost of degraded generation quality or require expensive retraining and task-specific supervision. We introduce CEBC, a lightweight, training-free framework for low-hallucination vision–language generation based on conformal evidence-bounded minimal editing. CEBC first produces a strong base output (via greedy decoding or best-of-K sampling), then applies an evidence-bounded editing step that minimally revises or suppresses unsupported object mentions using constraints derived from an external visual detector. Crucially, the evidence threshold is conformally calibrated on a small held-out set via quantiles of detector confidence scores, enabling explicit and controllable hallucination risk at test time. To balance factuality and informativeness, we further introduce a risk-first, quality-aware selection rule that prioritizes evidence-consistent generations while regularizing unnecessary length or lexical drift. Extensive experiments on MS-COCO and GQA for image captioning, and POPE for VQA evaluation across multiple VLMs demonstrate that CEBC consistently reduces hallucination rates (CHAIR_S, CHAIR_I, POPE) while maintaining or improving standard generation quality metrics (CIDEr, BLEU, CLIPScore). CEBC establishes a stronger factuality–quality Pareto frontier without any additional model training or access to paired supervision beyond an off-the-shelf detector.</abstract>
    <identifier type="citekey">mishra-etal-2026-cebc</identifier>
    <identifier type="doi">10.18653/v1/2026.acl-long.2142</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.2142/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>46193</start>
        <end>46206</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CEBC: Conformal Evidence-Bounded Control for Low-Hallucination Vision–Language Generation
%A Mishra, Ashish
%A Kumar, Tarun
%A Shah, Arpit
%A Bhattacharya, Suparna
%A Foltin, Martin
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F mishra-etal-2026-cebc
%X Hallucinated object mentions remain a persistent failure mode of vision–language models (VLMs) across generation tasks such as image captioning and visual question answering: outputs may be fluent yet include entities not supported by visual evidence. Existing mitigation approaches often reduce hallucinations at the cost of degraded generation quality or require expensive retraining and task-specific supervision. We introduce CEBC, a lightweight, training-free framework for low-hallucination vision–language generation based on conformal evidence-bounded minimal editing. CEBC first produces a strong base output (via greedy decoding or best-of-K sampling), then applies an evidence-bounded editing step that minimally revises or suppresses unsupported object mentions using constraints derived from an external visual detector. Crucially, the evidence threshold is conformally calibrated on a small held-out set via quantiles of detector confidence scores, enabling explicit and controllable hallucination risk at test time. To balance factuality and informativeness, we further introduce a risk-first, quality-aware selection rule that prioritizes evidence-consistent generations while regularizing unnecessary length or lexical drift. Extensive experiments on MS-COCO and GQA for image captioning, and POPE for VQA evaluation across multiple VLMs demonstrate that CEBC consistently reduces hallucination rates (CHAIR_S, CHAIR_I, POPE) while maintaining or improving standard generation quality metrics (CIDEr, BLEU, CLIPScore). CEBC establishes a stronger factuality–quality Pareto frontier without any additional model training or access to paired supervision beyond an off-the-shelf detector.
%R 10.18653/v1/2026.acl-long.2142
%U https://aclanthology.org/2026.acl-long.2142/
%U https://doi.org/10.18653/v1/2026.acl-long.2142
%P 46193-46206
Markdown (Informal)
[CEBC: Conformal Evidence-Bounded Control for Low-Hallucination Vision–Language Generation](https://aclanthology.org/2026.acl-long.2142/) (Mishra et al., ACL 2026)
ACL