@inproceedings{hasan-etal-2026-beyond,
title = "Beyond Lexical Similarity: Evaluating Faithfulness in {LLM}-Based Medical Question Reformulation",
author = "Hasan, Md Rabiul and
Ayalew, Aleka Melese and
Oussalah, Mourad",
editor = "Lopez-Garcia, Guillermo and
Gonzalez-Hernandez, Graciela",
booktitle = "Proceedings of the 11th Social Media Mining for Health Research and Applications ({SMM}4{H}-{H}ea{RD} 2026) Workshop and Shared Tasks",
month = jul,
year = "2026",
address = "San Diego, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.smm4h-1.16/",
pages = "93--102",
ISBN = "979-8-89176-432-3",
abstract = "Medical query rewriting transforms verbose consumer health questions into concise clinical queries, a critical step in health information retrieval. Large language models (LLMs) perform well on this task by standard metrics, yet high ROUGE or BERTScore does not guarantee preservation of clinical content. To address this issue, we introduce MedFaith-F1, a category-level faithfulness metric over four clinically salient categories: diagnoses, medications, procedures, and follow-up intent. We further propose a hybrid Evidence and Knowledge-Grounded Retrieval-Augmented Generation (EKG-RAG), an evidence and knowledge-grounded framework combining hybrid retrieval over PubMed and MedlinePlus resources with UMLS (Unified Medical Language System)-aligned ontology grounding. Evaluating large language models LLaMA-3 and Qwen2.5 across zero-shot, few-shot, and QLoRA settings on MeQSum and medical question-pair (MQP) datasets revealed that base models exhibit category-level hallucination rates exceeding 40{\%}, invisible to standard metrics, while EKG-RAG with QLoRA reduces this rate to 26.75{\%}, achieving MedFaith-F1 of 0.73. Our findings call for faithfulness-aware evaluation in clinical query rewriting, and MedFaith-F1 provides a reproducible step in that direction."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hasan-etal-2026-beyond">
<titleInfo>
<title>Beyond Lexical Similarity: Evaluating Faithfulness in LLM-Based Medical Question Reformulation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md</namePart>
<namePart type="given">Rabiul</namePart>
<namePart type="family">Hasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleka</namePart>
<namePart type="given">Melese</namePart>
<namePart type="family">Ayalew</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mourad</namePart>
<namePart type="family">Oussalah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Lopez-Garcia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graciela</namePart>
<namePart type="family">Gonzalez-Hernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-432-3</identifier>
</relatedItem>
<abstract>Medical query rewriting transforms verbose consumer health questions into concise clinical queries, a critical step in health information retrieval. Large language models (LLMs) perform well on this task by standard metrics, yet high ROUGE or BERTScore does not guarantee preservation of clinical content. To address this issue, we introduce MedFaith-F1, a category-level faithfulness metric over four clinically salient categories: diagnoses, medications, procedures, and follow-up intent. We further propose a hybrid Evidence and Knowledge-Grounded Retrieval-Augmented Generation (EKG-RAG), an evidence and knowledge-grounded framework combining hybrid retrieval over PubMed and MedlinePlus resources with UMLS (Unified Medical Language System)-aligned ontology grounding. Evaluating large language models LLaMA-3 and Qwen2.5 across zero-shot, few-shot, and QLoRA settings on MeQSum and medical question-pair (MQP) datasets revealed that base models exhibit category-level hallucination rates exceeding 40%, invisible to standard metrics, while EKG-RAG with QLoRA reduces this rate to 26.75%, achieving MedFaith-F1 of 0.73. Our findings call for faithfulness-aware evaluation in clinical query rewriting, and MedFaith-F1 provides a reproducible step in that direction.</abstract>
<identifier type="citekey">hasan-etal-2026-beyond</identifier>
<location>
<url>https://aclanthology.org/2026.smm4h-1.16/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>93</start>
<end>102</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond Lexical Similarity: Evaluating Faithfulness in LLM-Based Medical Question Reformulation
%A Hasan, Md Rabiul
%A Ayalew, Aleka Melese
%A Oussalah, Mourad
%Y Lopez-Garcia, Guillermo
%Y Gonzalez-Hernandez, Graciela
%S Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-432-3
%F hasan-etal-2026-beyond
%X Medical query rewriting transforms verbose consumer health questions into concise clinical queries, a critical step in health information retrieval. Large language models (LLMs) perform well on this task by standard metrics, yet high ROUGE or BERTScore does not guarantee preservation of clinical content. To address this issue, we introduce MedFaith-F1, a category-level faithfulness metric over four clinically salient categories: diagnoses, medications, procedures, and follow-up intent. We further propose a hybrid Evidence and Knowledge-Grounded Retrieval-Augmented Generation (EKG-RAG), an evidence and knowledge-grounded framework combining hybrid retrieval over PubMed and MedlinePlus resources with UMLS (Unified Medical Language System)-aligned ontology grounding. Evaluating large language models LLaMA-3 and Qwen2.5 across zero-shot, few-shot, and QLoRA settings on MeQSum and medical question-pair (MQP) datasets revealed that base models exhibit category-level hallucination rates exceeding 40%, invisible to standard metrics, while EKG-RAG with QLoRA reduces this rate to 26.75%, achieving MedFaith-F1 of 0.73. Our findings call for faithfulness-aware evaluation in clinical query rewriting, and MedFaith-F1 provides a reproducible step in that direction.
%U https://aclanthology.org/2026.smm4h-1.16/
%P 93-102
Markdown (Informal)
[Beyond Lexical Similarity: Evaluating Faithfulness in LLM-Based Medical Question Reformulation](https://aclanthology.org/2026.smm4h-1.16/) (Hasan et al., SMM4H 2026)
ACL