@inproceedings{ma-etal-2026-vericite,
title = "{VERICITE}: Evaluating Sentence-Level Citation Faithfulness in Retrieval-Augmented Medical Question Answering",
author = "Ma, Yixian and
Chu, Bohao and
Fuhr, Norbert",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Roberts, Kirk and
Tsujii, Junichi",
booktitle = "{B}io{NLP} 2026",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.bionlp-1.62/",
pages = "753--759",
ISBN = "979-8-89176-434-7",
abstract = "Retrieval-augmented generation (RAG) reduces hallucination in large language models by grounding outputs in retrieved evidence, but it does not guarantee that the resulting citations support the associated claims. We present VERICITE, a framework for evaluating citation faithfulness in retrieval-augmented medical QA. Our system retrieves PubMed abstracts via the NCBI E-Utilities API, prompts LLMs to generate answers with inline citations, and verifies each citation at the sentence level using a DeBERTa-v3-large NLI model. We evaluate four LLMs on 500 BioASQ questions at retrieval depths of 3 and 5, with extended experiments up to k = 15 and an oracle setting with gold standard documents. Only 27--41{\%} of citation pairs are supported at the sentence level at retrieval depths of 3 and 5, with support rates declining further at larger k. Under the oracle condition, answer quality improves, but citation faithfulness does not substantially improve, suggesting that generation-side citation behavior contributes substantially to unfaithful citations."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ma-etal-2026-vericite">
<titleInfo>
<title>VERICITE: Evaluating Sentence-Level Citation Faithfulness in Retrieval-Augmented Medical Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yixian</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bohao</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Norbert</namePart>
<namePart type="family">Fuhr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<abstract>Retrieval-augmented generation (RAG) reduces hallucination in large language models by grounding outputs in retrieved evidence, but it does not guarantee that the resulting citations support the associated claims. We present VERICITE, a framework for evaluating citation faithfulness in retrieval-augmented medical QA. Our system retrieves PubMed abstracts via the NCBI E-Utilities API, prompts LLMs to generate answers with inline citations, and verifies each citation at the sentence level using a DeBERTa-v3-large NLI model. We evaluate four LLMs on 500 BioASQ questions at retrieval depths of 3 and 5, with extended experiments up to k = 15 and an oracle setting with gold standard documents. Only 27–41% of citation pairs are supported at the sentence level at retrieval depths of 3 and 5, with support rates declining further at larger k. Under the oracle condition, answer quality improves, but citation faithfulness does not substantially improve, suggesting that generation-side citation behavior contributes substantially to unfaithful citations.</abstract>
<identifier type="citekey">ma-etal-2026-vericite</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.62/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>753</start>
<end>759</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VERICITE: Evaluating Sentence-Level Citation Faithfulness in Retrieval-Augmented Medical Question Answering
%A Ma, Yixian
%A Chu, Bohao
%A Fuhr, Norbert
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F ma-etal-2026-vericite
%X Retrieval-augmented generation (RAG) reduces hallucination in large language models by grounding outputs in retrieved evidence, but it does not guarantee that the resulting citations support the associated claims. We present VERICITE, a framework for evaluating citation faithfulness in retrieval-augmented medical QA. Our system retrieves PubMed abstracts via the NCBI E-Utilities API, prompts LLMs to generate answers with inline citations, and verifies each citation at the sentence level using a DeBERTa-v3-large NLI model. We evaluate four LLMs on 500 BioASQ questions at retrieval depths of 3 and 5, with extended experiments up to k = 15 and an oracle setting with gold standard documents. Only 27–41% of citation pairs are supported at the sentence level at retrieval depths of 3 and 5, with support rates declining further at larger k. Under the oracle condition, answer quality improves, but citation faithfulness does not substantially improve, suggesting that generation-side citation behavior contributes substantially to unfaithful citations.
%U https://aclanthology.org/2026.bionlp-1.62/
%P 753-759
Markdown (Informal)
[VERICITE: Evaluating Sentence-Level Citation Faithfulness in Retrieval-Augmented Medical Question Answering](https://aclanthology.org/2026.bionlp-1.62/) (Ma et al., BioNLP 2026)
ACL