BibTeX
@inproceedings{phasook-etal-2025-vereafine,
title = "{V}e{R}ea{F}ine: Iterative Verification Reasoning Refinement {RAG} for Hallucination-Resistant on Open-Ended Clinical {QA}",
author = "Phasook, Pakawat and
Pitijaroonpong, Rapepong and
Kinchagawat, Jiramet and
Chinkamol, Amrest and
Saengja, Tossaporn and
Udomlapsakul, Kiartnarin and
Sawatphol, Jitkapat and
Ittichaiwong, Piyalitt",
editor = "Soni, Sarvesh and
Demner-Fushman, Dina",
booktitle = "Proceedings of the 24th Workshop on Biomedical Language Processing (Shared Tasks)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bionlp-share.34/",
doi = "10.18653/v1/2025.bionlp-share.34",
pages = "281--288",
ISBN = "979-8-89176-276-3",
abstract = "We present VeReaFine, a novel ``Verifier-RAG'' pipeline designed to eliminate hallucinations in open-ended clinical question answering. VeReaFine interleaves three tightly coupled stages{---}retrieval, verification, and generation{---}across up to three iterations. First, a two-stage dense retriever (BM-Retriever-410M {\textrightarrow} BM-Reranker-2B) fetches and ranks top-k biomedical passages; an 8B-parameter MedReason verifier then filters these for direct relevance and identifies missing evidence. When the verifier deems the context insufficient, it formulates a focused ``feedback query'' to retrieve additional passages (bounded to prevent infinite loops). Once a minimal ground-truth context is assembled, a 7B-parameter generator (Qwen2.5-7B-Instruct) drafts an answer purely from that vetted context, and the verifier performs a final check{---}prompting the generator to refine any remaining unsupported claims. By iteratively fetching only missing facts and ensuring every assertion is evidence-backed, VeReaFine achieves monotonic factuality improvements with minimal overhead. On the BioNLP 2025 ClinIQLink ``LLM Lie-Detector'' shared task, our 7B generator augmented with VeReaFine matches or surpasses a 32B medical model on open-ended reasoning metrics, reducing multi-hop inverse step-identification errors by 26{\%}. These findings demonstrate that moderate-size LLMs, when guided by targeted verification loops, can deliver expert-level reliability in clinical QA."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="phasook-etal-2025-vereafine">
<titleInfo>
<title>VeReaFine: Iterative Verification Reasoning Refinement RAG for Hallucination-Resistant on Open-Ended Clinical QA</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pakawat</namePart>
<namePart type="family">Phasook</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rapepong</namePart>
<namePart type="family">Pitijaroonpong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiramet</namePart>
<namePart type="family">Kinchagawat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amrest</namePart>
<namePart type="family">Chinkamol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tossaporn</namePart>
<namePart type="family">Saengja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kiartnarin</namePart>
<namePart type="family">Udomlapsakul</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jitkapat</namePart>
<namePart type="family">Sawatphol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Piyalitt</namePart>
<namePart type="family">Ittichaiwong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Workshop on Biomedical Language Processing (Shared Tasks)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarvesh</namePart>
<namePart type="family">Soni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-276-3</identifier>
</relatedItem>
<abstract>We present VeReaFine, a novel “Verifier-RAG” pipeline designed to eliminate hallucinations in open-ended clinical question answering. VeReaFine interleaves three tightly coupled stages—retrieval, verification, and generation—across up to three iterations. First, a two-stage dense retriever (BM-Retriever-410M → BM-Reranker-2B) fetches and ranks top-k biomedical passages; an 8B-parameter MedReason verifier then filters these for direct relevance and identifies missing evidence. When the verifier deems the context insufficient, it formulates a focused “feedback query” to retrieve additional passages (bounded to prevent infinite loops). Once a minimal ground-truth context is assembled, a 7B-parameter generator (Qwen2.5-7B-Instruct) drafts an answer purely from that vetted context, and the verifier performs a final check—prompting the generator to refine any remaining unsupported claims. By iteratively fetching only missing facts and ensuring every assertion is evidence-backed, VeReaFine achieves monotonic factuality improvements with minimal overhead. On the BioNLP 2025 ClinIQLink “LLM Lie-Detector” shared task, our 7B generator augmented with VeReaFine matches or surpasses a 32B medical model on open-ended reasoning metrics, reducing multi-hop inverse step-identification errors by 26%. These findings demonstrate that moderate-size LLMs, when guided by targeted verification loops, can deliver expert-level reliability in clinical QA.</abstract>
<identifier type="citekey">phasook-etal-2025-vereafine</identifier>
<identifier type="doi">10.18653/v1/2025.bionlp-share.34</identifier>
<location>
<url>https://aclanthology.org/2025.bionlp-share.34/</url>
</location>
<part>
<date>2025-08</date>
<extent unit="page">
<start>281</start>
<end>288</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T VeReaFine: Iterative Verification Reasoning Refinement RAG for Hallucination-Resistant on Open-Ended Clinical QA
%A Phasook, Pakawat
%A Pitijaroonpong, Rapepong
%A Kinchagawat, Jiramet
%A Chinkamol, Amrest
%A Saengja, Tossaporn
%A Udomlapsakul, Kiartnarin
%A Sawatphol, Jitkapat
%A Ittichaiwong, Piyalitt
%Y Soni, Sarvesh
%Y Demner-Fushman, Dina
%S Proceedings of the 24th Workshop on Biomedical Language Processing (Shared Tasks)
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-276-3
%F phasook-etal-2025-vereafine
%X We present VeReaFine, a novel “Verifier-RAG” pipeline designed to eliminate hallucinations in open-ended clinical question answering. VeReaFine interleaves three tightly coupled stages—retrieval, verification, and generation—across up to three iterations. First, a two-stage dense retriever (BM-Retriever-410M → BM-Reranker-2B) fetches and ranks top-k biomedical passages; an 8B-parameter MedReason verifier then filters these for direct relevance and identifies missing evidence. When the verifier deems the context insufficient, it formulates a focused “feedback query” to retrieve additional passages (bounded to prevent infinite loops). Once a minimal ground-truth context is assembled, a 7B-parameter generator (Qwen2.5-7B-Instruct) drafts an answer purely from that vetted context, and the verifier performs a final check—prompting the generator to refine any remaining unsupported claims. By iteratively fetching only missing facts and ensuring every assertion is evidence-backed, VeReaFine achieves monotonic factuality improvements with minimal overhead. On the BioNLP 2025 ClinIQLink “LLM Lie-Detector” shared task, our 7B generator augmented with VeReaFine matches or surpasses a 32B medical model on open-ended reasoning metrics, reducing multi-hop inverse step-identification errors by 26%. These findings demonstrate that moderate-size LLMs, when guided by targeted verification loops, can deliver expert-level reliability in clinical QA.
%R 10.18653/v1/2025.bionlp-share.34
%U https://aclanthology.org/2025.bionlp-share.34/
%U https://doi.org/10.18653/v1/2025.bionlp-share.34
%P 281-288
Markdown (Informal)
[VeReaFine: Iterative Verification Reasoning Refinement RAG for Hallucination-Resistant on Open-Ended Clinical QA](https://aclanthology.org/2025.bionlp-share.34/) (Phasook et al., BioNLP 2025)
ACL
- Pakawat Phasook, Rapepong Pitijaroonpong, Jiramet Kinchagawat, Amrest Chinkamol, Tossaporn Saengja, Kiartnarin Udomlapsakul, Jitkapat Sawatphol, and Piyalitt Ittichaiwong. 2025. VeReaFine: Iterative Verification Reasoning Refinement RAG for Hallucination-Resistant on Open-Ended Clinical QA. In Proceedings of the 24th Workshop on Biomedical Language Processing (Shared Tasks), pages 281–288, Vienna, Austria. Association for Computational Linguistics.
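The abstract above outlines a bounded retrieve-verify-generate loop. The sketch below is a minimal, hypothetical illustration of that control flow only, not the authors' implementation: every name (`retrieve`, `verify`, `generate`, `check_claims`, `Verdict`, `vereafine_answer`) is an assumed stand-in for the BM-Retriever/BM-Reranker stack, the MedReason verifier, and the Qwen2.5-7B-Instruct generator described in the paper, and the prompts are placeholders.

```python
from dataclasses import dataclass, field
from typing import Callable, List


@dataclass
class Verdict:
    """Hypothetical verifier output: evidence kept, sufficiency flag, gap query."""
    sufficient: bool                     # does the kept evidence cover the question?
    relevant: List[str] = field(default_factory=list)  # passages the verifier kept
    feedback_query: str = ""             # focused query for missing evidence, if any


def vereafine_answer(
    question: str,
    retrieve: Callable[[str], List[str]],            # stand-in: two-stage dense retrieval + reranking
    verify: Callable[[str, List[str]], Verdict],     # stand-in: evidence filtering / gap detection
    generate: Callable[[str, List[str]], str],       # stand-in: answer drafting from vetted context
    check_claims: Callable[[str, List[str]], List[str]],  # stand-in: unsupported-claim detection
    max_iterations: int = 3,                         # bounded, per the abstract, to avoid infinite loops
) -> str:
    context: List[str] = []
    query = question

    # Iteratively fetch only the missing evidence, up to the iteration bound.
    for _ in range(max_iterations):
        new_passages = [p for p in retrieve(query) if p not in context]
        context.extend(new_passages)
        verdict = verify(question, context)
        context = verdict.relevant
        if verdict.sufficient:
            break
        # Ask specifically for what the verifier found missing.
        query = verdict.feedback_query or question

    # Draft an answer purely from the vetted context.
    answer = generate(question, context)

    # Final check: if any claims are unsupported by the context, refine once.
    unsupported = check_claims(answer, context)
    if unsupported:
        refinement_prompt = (
            question
            + "\nRevise the previous answer, removing or correcting these "
            + "unsupported claims: "
            + "; ".join(unsupported)
        )
        answer = generate(refinement_prompt, context)
    return answer
```

Under these assumptions, the two properties the abstract emphasizes fall out of the structure: generation only ever sees verifier-approved passages, and the feedback loop is capped at a fixed iteration budget, so each pass can only add evidence the verifier asked for.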