BibTeX
@inproceedings{cheng-etal-2024-dont,
    title = "Don{'}t be my Doctor! Recognizing Healthcare Advice in Large Language Models",
    author = "Cheng, Kellen Tan and
      Gentile, Anna Lisa and
      Li, Pengyuan and
      DeLuca, Chad and
      Ren, Guang-Jie",
    editor = "Dernoncourt, Franck and
      Preo{\c{t}}iuc-Pietro, Daniel and
      Shimorina, Anastasia",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
    month = nov,
    year = "2024",
    address = "Miami, Florida, US",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-industry.72",
    pages = "970--980",
    abstract = "Large language models (LLMs) have seen increasing popularity in daily use, with their widespread adoption by many corporations as virtual assistants, chatbots, predictors, and many more. Their growing influence raises the need for safeguards and guardrails to ensure that the outputs from LLMs do not mislead or harm users. This is especially true for highly regulated domains such as healthcare, where misleading advice may influence users to unknowingly commit malpractice. Despite this vulnerability, the majority of guardrail benchmarking datasets do not focus enough on medical advice specifically. In this paper, we present the HeAL benchmark (HEalth Advice in LLMs), a health-advice benchmark dataset that has been manually curated and annotated to evaluate LLMs{'} capability in recognizing health-advice - which we use to safeguard LLMs deployed in industrial settings. We use HeAL to assess several models and report a detailed analysis of the findings.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cheng-etal-2024-dont">
<titleInfo>
<title>Don’t be my Doctor! Recognizing Healthcare Advice in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kellen</namePart>
<namePart type="given">Tan</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="given">Lisa</namePart>
<namePart type="family">Gentile</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengyuan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chad</namePart>
<namePart type="family">DeLuca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guang-Jie</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models (LLMs) have seen increasing popularity in daily use, with their widespread adoption by many corporations as virtual assistants, chatbots, predictors, and many more. Their growing influence raises the need for safeguards and guardrails to ensure that the outputs from LLMs do not mislead or harm users. This is especially true for highly regulated domains such as healthcare, where misleading advice may influence users to unknowingly commit malpractice. Despite this vulnerability, the majority of guardrail benchmarking datasets do not focus enough on medical advice specifically. In this paper, we present the HeAL benchmark (HEalth Advice in LLMs), a health-advice benchmark dataset that has been manually curated and annotated to evaluate LLMs’ capability in recognizing health-advice - which we use to safeguard LLMs deployed in industrial settings. We use HeAL to assess several models and report a detailed analysis of the findings.</abstract>
<identifier type="citekey">cheng-etal-2024-dont</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.72</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>970</start>
<end>980</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Don’t be my Doctor! Recognizing Healthcare Advice in Large Language Models
%A Cheng, Kellen Tan
%A Gentile, Anna Lisa
%A Li, Pengyuan
%A DeLuca, Chad
%A Ren, Guang-Jie
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F cheng-etal-2024-dont
%X Large language models (LLMs) have seen increasing popularity in daily use, with their widespread adoption by many corporations as virtual assistants, chatbots, predictors, and many more. Their growing influence raises the need for safeguards and guardrails to ensure that the outputs from LLMs do not mislead or harm users. This is especially true for highly regulated domains such as healthcare, where misleading advice may influence users to unknowingly commit malpractice. Despite this vulnerability, the majority of guardrail benchmarking datasets do not focus enough on medical advice specifically. In this paper, we present the HeAL benchmark (HEalth Advice in LLMs), a health-advice benchmark dataset that has been manually curated and annotated to evaluate LLMs’ capability in recognizing health-advice - which we use to safeguard LLMs deployed in industrial settings. We use HeAL to assess several models and report a detailed analysis of the findings.
%U https://aclanthology.org/2024.emnlp-industry.72
%P 970-980
Markdown (Informal)
[Don’t be my Doctor! Recognizing Healthcare Advice in Large Language Models](https://aclanthology.org/2024.emnlp-industry.72) (Cheng et al., EMNLP 2024)
ACL
Kellen Tan Cheng, Anna Lisa Gentile, Pengyuan Li, Chad DeLuca, and Guang-Jie Ren. 2024. Don’t be my Doctor! Recognizing Healthcare Advice in Large Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 970–980, Miami, Florida, US. Association for Computational Linguistics.