@inproceedings{azeez-etal-2025-truth,
title = "Truth, Trust, and Trouble: Medical {AI} on the Edge",
author = "Azeez, Mohammad Anas and
Ali, Rafiq and
Shabbir, Ebad and
Siddiqui, Zohaib Hasan and
Kashyap, Gautam Siddharth and
Gao, Jiechao and
Naseem, Usman",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.69/",
pages = "1017--1025",
ISBN = "979-8-89176-333-3",
abstract = "Large Language Models (LLMs) hold significant promise for transforming digital health by enabling automated medical question answering. However, ensuring these models meet critical industry standards for factual accuracy, usefulness, and safety remains a challenge, especially for open-source solutions. We present a rigorous benchmarking framework via a dataset of over 1,000 health questions. We assess model performance across honesty, helpfulness, and harmlessness. Our results highlight trade-offs between factual reliability and safety among evaluated models{---}Mistral-7B, BioMistral-7B-DARE, and AlpaCare-13B. AlpaCare-13B achieves the highest accuracy (91.7{\%}) and harmlessness (0.92), while domain-specific tuning in BioMistral-7B-DARE boosts safety (0.90) despite smaller scale. Few-shot prompting improves accuracy from 78{\%} to 85{\%}, and all models show reduced helpfulness on complex queries, highlighting challenges in clinical QA. Our code is available at: https://github.com/AnasAzeez/TTT"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="azeez-etal-2025-truth">
<titleInfo>
<title>Truth, Trust, and Trouble: Medical AI on the Edge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Anas</namePart>
<namePart type="family">Azeez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafiq</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ebad</namePart>
<namePart type="family">Shabbir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zohaib</namePart>
<namePart type="given">Hasan</namePart>
<namePart type="family">Siddiqui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gautam</namePart>
<namePart type="given">Siddharth</namePart>
<namePart type="family">Kashyap</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiechao</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Usman</namePart>
<namePart type="family">Naseem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) hold significant promise for transforming digital health by enabling automated medical question answering. However, ensuring these models meet critical industry standards for factual accuracy, usefulness, and safety remains a challenge, especially for open-source solutions. We present a rigorous benchmarking framework via a dataset of over 1,000 health questions. We assess model performance across honesty, helpfulness, and harmlessness. Our results highlight trade-offs between factual reliability and safety among evaluated models—Mistral-7B, BioMistral-7B-DARE, and AlpaCare-13B. AlpaCare-13B achieves the highest accuracy (91.7%) and harmlessness (0.92), while domain-specific tuning in BioMistral-7B-DARE boosts safety (0.90) despite smaller scale. Few-shot prompting improves accuracy from 78% to 85%, and all models show reduced helpfulness on complex queries, highlighting challenges in clinical QA. Our code is available at: https://github.com/AnasAzeez/TTT</abstract>
<identifier type="citekey">azeez-etal-2025-truth</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.69/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1017</start>
<end>1025</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Truth, Trust, and Trouble: Medical AI on the Edge
%A Azeez, Mohammad Anas
%A Ali, Rafiq
%A Shabbir, Ebad
%A Siddiqui, Zohaib Hasan
%A Kashyap, Gautam Siddharth
%A Gao, Jiechao
%A Naseem, Usman
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F azeez-etal-2025-truth
%X Large Language Models (LLMs) hold significant promise for transforming digital health by enabling automated medical question answering. However, ensuring these models meet critical industry standards for factual accuracy, usefulness, and safety remains a challenge, especially for open-source solutions. We present a rigorous benchmarking framework via a dataset of over 1,000 health questions. We assess model performance across honesty, helpfulness, and harmlessness. Our results highlight trade-offs between factual reliability and safety among evaluated models—Mistral-7B, BioMistral-7B-DARE, and AlpaCare-13B. AlpaCare-13B achieves the highest accuracy (91.7%) and harmlessness (0.92), while domain-specific tuning in BioMistral-7B-DARE boosts safety (0.90) despite smaller scale. Few-shot prompting improves accuracy from 78% to 85%, and all models show reduced helpfulness on complex queries, highlighting challenges in clinical QA. Our code is available at: https://github.com/AnasAzeez/TTT
%U https://aclanthology.org/2025.emnlp-industry.69/
%P 1017-1025
Markdown (Informal)
[Truth, Trust, and Trouble: Medical AI on the Edge](https://aclanthology.org/2025.emnlp-industry.69/) (Azeez et al., EMNLP 2025)
ACL
- Mohammad Anas Azeez, Rafiq Ali, Ebad Shabbir, Zohaib Hasan Siddiqui, Gautam Siddharth Kashyap, Jiechao Gao, and Usman Naseem. 2025. Truth, Trust, and Trouble: Medical AI on the Edge. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1017–1025, Suzhou (China). Association for Computational Linguistics.