@inproceedings{han-etal-2026-evidence,
title = "When Evidence Conflicts: Uncertainty and Order Effects in Retrieval-Augmented Biomedical Question Answering",
author = "Han, Yikun and
Lan, Mengfei and
Kilicoglu, Halil",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Roberts, Kirk and
Tsujii, Junichi",
booktitle = "{B}io{NLP} 2026",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.bionlp-1.50/",
pages = "630--643",
ISBN = "979-8-89176-434-7",
abstract = "Biomedical retrieval-augmented LLMs are often evaluated under helpful retrieved context, but in practice the evidence can also be misleading or internally conflicting. This paper studies uncertainty under those harder settings using the HealthContradict benchmark and six open-weight models. We evaluate five controlled evidence conditions: no context, correct-only context, incorrect-only context, and two mixed conditions that contain the same correct and contradictory documents in opposite orders. Correct evidence improves both accuracy and calibration, while incorrect evidence substantially degrades both. Under conflicting evidence, document order also matters: reversing the order of the same two documents changes 11.4{\%}{--}25.2{\%} of predictions and consistently reduces performance when the incorrect document appears first. We further evaluate a conflict-aware abstention score that combines model confidence with a detector of evidence conflict. In the two hardest conditions, incorrect-only and incorrect-first conflict, this score improves selective accuracy over confidence-only abstention, with mean gains of 7.2{--}33.4 and 3.6{--}14.4 points across 75{\%}, 50{\%}, and 25{\%} coverage. These results show that biomedical RAG systems should be evaluated not only under helpful retrieval, but also under misleading and conflicting evidence."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="han-etal-2026-evidence">
<titleInfo>
<title>When Evidence Conflicts: Uncertainty and Order Effects in Retrieval-Augmented Biomedical Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yikun</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mengfei</namePart>
<namePart type="family">Lan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Halil</namePart>
<namePart type="family">Kilicoglu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<abstract>Biomedical retrieval-augmented LLMs are often evaluated under helpful retrieved context, but in practice the evidence can also be misleading or internally conflicting. This paper studies uncertainty under those harder settings using the HealthContradict benchmark and six open-weight models. We evaluate five controlled evidence conditions: no context, correct-only context, incorrect-only context, and two mixed conditions that contain the same correct and contradictory documents in opposite orders. Correct evidence improves both accuracy and calibration, while incorrect evidence substantially degrades both. Under conflicting evidence, document order also matters: reversing the order of the same two documents changes 11.4%–25.2% of predictions and consistently reduces performance when the incorrect document appears first. We further evaluate a conflict-aware abstention score that combines model confidence with a detector of evidence conflict. In the two hardest conditions, incorrect-only and incorrect-first conflict, this score improves selective accuracy over confidence-only abstention, with mean gains of 7.2–33.4 and 3.6–14.4 points across 75%, 50%, and 25% coverage. These results show that biomedical RAG systems should be evaluated not only under helpful retrieval, but also under misleading and conflicting evidence.</abstract>
<identifier type="citekey">han-etal-2026-evidence</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.50/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>630</start>
<end>643</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When Evidence Conflicts: Uncertainty and Order Effects in Retrieval-Augmented Biomedical Question Answering
%A Han, Yikun
%A Lan, Mengfei
%A Kilicoglu, Halil
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F han-etal-2026-evidence
%X Biomedical retrieval-augmented LLMs are often evaluated under helpful retrieved context, but in practice the evidence can also be misleading or internally conflicting. This paper studies uncertainty under those harder settings using the HealthContradict benchmark and six open-weight models. We evaluate five controlled evidence conditions: no context, correct-only context, incorrect-only context, and two mixed conditions that contain the same correct and contradictory documents in opposite orders. Correct evidence improves both accuracy and calibration, while incorrect evidence substantially degrades both. Under conflicting evidence, document order also matters: reversing the order of the same two documents changes 11.4%–25.2% of predictions and consistently reduces performance when the incorrect document appears first. We further evaluate a conflict-aware abstention score that combines model confidence with a detector of evidence conflict. In the two hardest conditions, incorrect-only and incorrect-first conflict, this score improves selective accuracy over confidence-only abstention, with mean gains of 7.2–33.4 and 3.6–14.4 points across 75%, 50%, and 25% coverage. These results show that biomedical RAG systems should be evaluated not only under helpful retrieval, but also under misleading and conflicting evidence.
%U https://aclanthology.org/2026.bionlp-1.50/
%P 630-643
Markdown (Informal)
[When Evidence Conflicts: Uncertainty and Order Effects in Retrieval-Augmented Biomedical Question Answering](https://aclanthology.org/2026.bionlp-1.50/) (Han et al., BioNLP 2026)
ACL