@inproceedings{pal-sankarasubbu-2024-gemini,
title = "Gemini Goes to {M}ed School: Exploring the Capabilities of Multimodal Large Language Models on Medical Challenge Problems {\&} Hallucinations",
author = "Pal, Ankit and
Sankarasubbu, Malaikannan",
editor = "Naumann, Tristan and
Ben Abacha, Asma and
Bethard, Steven and
Roberts, Kirk and
Bitterman, Danielle",
booktitle = "Proceedings of the 6th Clinical Natural Language Processing Workshop",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.clinicalnlp-1.3",
doi = "10.18653/v1/2024.clinicalnlp-1.3",
pages = "21--46",
abstract = "Large language models have the potential to be valuable in the healthcare industry, but it{'}s crucial to verify their safety and effectiveness through rigorous evaluation. In our study, we evaluated LLMs, including Google{'}s Gemini, across various medical tasks. Despite Gemini{'}s capabilities, it underperformed compared to leading models like MedPaLM 2 and GPT-4, particularly in medical visual question answering (VQA), with a notable accuracy gap (Gemini at 61.45{\%} vs. GPT-4V at 88{\%}). Our analysis revealed that Gemini is highly susceptible to hallucinations, overconfidence, and knowledge gaps, which indicate risks if deployed uncritically. We also performed a detailed analysis by medical subject and test type, providing actionable feedback for developers and clinicians. To mitigate risks, we implemented effective prompting strategies, improving performance, and contributed to the field by releasing a Python module for medical LLM evaluation and establishing a leaderboard on Hugging Face for ongoing research and development. Python module can be found at https://github.com/promptslab/RosettaEval",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pal-sankarasubbu-2024-gemini">
<titleInfo>
<title>Gemini Goes to Med School: Exploring the Capabilities of Multimodal Large Language Models on Medical Challenge Problems & Hallucinations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ankit</namePart>
<namePart type="family">Pal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malaikannan</namePart>
<namePart type="family">Sankarasubbu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Clinical Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tristan</namePart>
<namePart type="family">Naumann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asma</namePart>
<namePart type="family">Ben Abacha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danielle</namePart>
<namePart type="family">Bitterman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large language models have the potential to be valuable in the healthcare industry, but it’s crucial to verify their safety and effectiveness through rigorous evaluation. In our study, we evaluated LLMs, including Google’s Gemini, across various medical tasks. Despite Gemini’s capabilities, it underperformed compared to leading models like MedPaLM 2 and GPT-4, particularly in medical visual question answering (VQA), with a notable accuracy gap (Gemini at 61.45% vs. GPT-4V at 88%). Our analysis revealed that Gemini is highly susceptible to hallucinations, overconfidence, and knowledge gaps, which indicate risks if deployed uncritically. We also performed a detailed analysis by medical subject and test type, providing actionable feedback for developers and clinicians. To mitigate risks, we implemented effective prompting strategies, improving performance, and contributed to the field by releasing a Python module for medical LLM evaluation and establishing a leaderboard on Hugging Face for ongoing research and development. Python module can be found at https://github.com/promptslab/RosettaEval</abstract>
<identifier type="citekey">pal-sankarasubbu-2024-gemini</identifier>
<identifier type="doi">10.18653/v1/2024.clinicalnlp-1.3</identifier>
<location>
<url>https://aclanthology.org/2024.clinicalnlp-1.3</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>21</start>
<end>46</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Gemini Goes to Med School: Exploring the Capabilities of Multimodal Large Language Models on Medical Challenge Problems & Hallucinations
%A Pal, Ankit
%A Sankarasubbu, Malaikannan
%Y Naumann, Tristan
%Y Ben Abacha, Asma
%Y Bethard, Steven
%Y Roberts, Kirk
%Y Bitterman, Danielle
%S Proceedings of the 6th Clinical Natural Language Processing Workshop
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F pal-sankarasubbu-2024-gemini
%X Large language models have the potential to be valuable in the healthcare industry, but it’s crucial to verify their safety and effectiveness through rigorous evaluation. In our study, we evaluated LLMs, including Google’s Gemini, across various medical tasks. Despite Gemini’s capabilities, it underperformed compared to leading models like MedPaLM 2 and GPT-4, particularly in medical visual question answering (VQA), with a notable accuracy gap (Gemini at 61.45% vs. GPT-4V at 88%). Our analysis revealed that Gemini is highly susceptible to hallucinations, overconfidence, and knowledge gaps, which indicate risks if deployed uncritically. We also performed a detailed analysis by medical subject and test type, providing actionable feedback for developers and clinicians. To mitigate risks, we implemented effective prompting strategies, improving performance, and contributed to the field by releasing a Python module for medical LLM evaluation and establishing a leaderboard on Hugging Face for ongoing research and development. Python module can be found at https://github.com/promptslab/RosettaEval
%R 10.18653/v1/2024.clinicalnlp-1.3
%U https://aclanthology.org/2024.clinicalnlp-1.3
%U https://doi.org/10.18653/v1/2024.clinicalnlp-1.3
%P 21-46
Markdown (Informal)
[Gemini Goes to Med School: Exploring the Capabilities of Multimodal Large Language Models on Medical Challenge Problems & Hallucinations](https://aclanthology.org/2024.clinicalnlp-1.3) (Pal & Sankarasubbu, ClinicalNLP-WS 2024)
ACL