@inproceedings{samyoun-etal-2026-attribution,
title = "Attribution-Guided Multi-Object Hallucination and Bias Detection in Vision-Language Models",
author = "Samyoun, Sirat and
Xiao, Yingtai and
Du, Jian",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.210/",
pages = "4529--4548",
ISBN = "979-8-89176-380-7",
abstract = "Vision-Language Models excel in multi-modal tasks but often hallucinate objects or exhibit linguistic bias by over-repeating object names, especially in complex multi-object scenes. Existing methods struggle with multi-object grounding because language priors frequently dominate visual evidence, causing hallucinated or biased objects to produce attention distributions or similarity scores nearly indistinguishable from those of real objects. We introduce SHAPLENS, a Shapley value{--}based attribution framework using Kernel SHAP and multi-layer fusion to detect hallucinated and biased objects. Evaluated on ADE and COCO datasets across four leading VLMs, SHAPLENS improves hallucination detection accuracy by 8{--}12{\%} and F1 by 10{--}14{\%} over the best baselines. It also achieves up to 6{\%} higher bias detection performance across three distinct bias types on a curated HQH benchmark and exhibits minimal degradation ({\ensuremath{<}}0.03{\%}) across partial and perturbed contexts."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="samyoun-etal-2026-attribution">
<titleInfo>
<title>Attribution-Guided Multi-Object Hallucination and Bias Detection in Vision-Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sirat</namePart>
<namePart type="family">Samyoun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yingtai</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Vision-Language Models excel in multi-modal tasks but often hallucinate objects or exhibit linguistic bias by over-repeating object names, especially in complex multi-object scenes. Existing methods struggle with multi-object grounding because language priors frequently dominate visual evidence, causing hallucinated or biased objects to produce attention distributions or similarity scores nearly indistinguishable from those of real objects. We introduce SHAPLENS, a Shapley value–based attribution framework using Kernel SHAP and multi-layer fusion to detect hallucinated and biased objects. Evaluated on ADE and COCO datasets across four leading VLMs, SHAPLENS improves hallucination detection accuracy by 8–12% and F1 by 10–14% over the best baselines. It also achieves up to 6% higher bias detection performance across three distinct bias types on a curated HQH benchmark and exhibits minimal degradation (&lt;0.03%) across partial and perturbed contexts.</abstract>
<identifier type="citekey">samyoun-etal-2026-attribution</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.210/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>4529</start>
<end>4548</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Attribution-Guided Multi-Object Hallucination and Bias Detection in Vision-Language Models
%A Samyoun, Sirat
%A Xiao, Yingtai
%A Du, Jian
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F samyoun-etal-2026-attribution
%X Vision-Language Models excel in multi-modal tasks but often hallucinate objects or exhibit linguistic bias by over-repeating object names, especially in complex multi-object scenes. Existing methods struggle with multi-object grounding because language priors frequently dominate visual evidence, causing hallucinated or biased objects to produce attention distributions or similarity scores nearly indistinguishable from those of real objects. We introduce SHAPLENS, a Shapley value–based attribution framework using Kernel SHAP and multi-layer fusion to detect hallucinated and biased objects. Evaluated on ADE and COCO datasets across four leading VLMs, SHAPLENS improves hallucination detection accuracy by 8–12% and F1 by 10–14% over the best baselines. It also achieves up to 6% higher bias detection performance across three distinct bias types on a curated HQH benchmark and exhibits minimal degradation (<0.03%) across partial and perturbed contexts.
%U https://aclanthology.org/2026.eacl-long.210/
%P 4529-4548
Markdown (Informal)
[Attribution-Guided Multi-Object Hallucination and Bias Detection in Vision-Language Models](https://aclanthology.org/2026.eacl-long.210/) (Samyoun et al., EACL 2026)
ACL
Sirat Samyoun, Yingtai Xiao, and Jian Du. 2026. Attribution-Guided Multi-Object Hallucination and Bias Detection in Vision-Language Models. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 4529–4548, Rabat, Morocco. Association for Computational Linguistics.
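
As a loose illustration of the Kernel SHAP attribution idea mentioned in the abstract (not the SHAPLENS framework itself, which additionally uses multi-layer fusion), the sketch below uses the `shap` package's `KernelExplainer` to attribute a toy object-mention score to binary patch-presence features. The `toy_vlm_score` function and the 8-patch setup are invented for the example; a real setup would re-run a VLM on masked images.

```python
# Hypothetical sketch only: Kernel SHAP over binary "patch present/absent"
# features. This is NOT the SHAPLENS method; it just shows the attribution idea.
import numpy as np
import shap

def toy_vlm_score(masks: np.ndarray) -> np.ndarray:
    """Stand-in for a VLM's confidence that an object is mentioned,
    given which of 8 image patches are visible (1) or masked out (0)."""
    weights = np.array([0.5, 0.1, 0.0, 0.3, 0.0, 0.05, 0.0, 0.05])
    return masks @ weights

background = np.zeros((1, 8))   # reference input: all patches masked out
full_image = np.ones((1, 8))    # query input: all patches visible

explainer = shap.KernelExplainer(toy_vlm_score, background)
patch_attributions = explainer.shap_values(full_image)
print(patch_attributions)       # per-patch contribution to the toy score
```

In this toy setting, a patch with near-zero attribution contributes little visual evidence for the object, which is the kind of signal an attribution-based detector could threshold to flag hallucinated or biased mentions.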