@inproceedings{dada-etal-2025-biomedical,
  title     = {Does Biomedical Training Lead to Better Medical Performance?},
  author    = {Dada, Amin and
               Kora{\c{s}}, Osman Alperen and
               Bauer, Marie and
               Corbeil, Jean-Philippe and
               Contreras, Amanda Butler and
               Seibold, Constantin Marc and
               Smith, Kaleb E and
               Friedrich, Julian and
               Kleesiek, Jens},
  editor    = {Arviv, Ofir and
               Clinciu, Miruna and
               Dhole, Kaustubh and
               Dror, Rotem and
               Gehrmann, Sebastian and
               Habba, Eliya and
               Itzhak, Itay and
               Mille, Simon and
               Perlitz, Yotam and
               Santus, Enrico and
               Sedoc, Jo{\~a}o and
               Shmueli Scheuer, Michal and
               Stanovsky, Gabriel and
               Tafjord, Oyvind},
  booktitle = {Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.gem-1.5/},
  pages     = {46--59},
  isbn      = {979-8-89176-261-9},
  abstract  = {Large Language Models (LLMs) hold significant potential for improving healthcare applications, with biomedically adapted models promising enhanced performance on medical tasks. However, the effectiveness of biomedical domain adaptation for clinical tasks remains uncertain. In this study, we conduct a direct comparison of 12 biomedically adapted models and their general-domain base counterparts across six clinical tasks. Our results reveal that 11 out of 12 biomedical models exhibit performance declines, challenging prior findings that reported positive effects of biomedical adaptation. Notably, previous positive results primarily relied on multiple-choice evaluations, which may not reflect performance in real-world clinical applications. To promote reproducibility and further research, we open-source our evaluation pipeline, providing a resource for the development of models with practical benefits in healthcare settings.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dada-etal-2025-biomedical">
<titleInfo>
<title>Does Biomedical Training Lead to Better Medical Performance?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amin</namePart>
<namePart type="family">Dada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Osman</namePart>
<namePart type="given">Alperen</namePart>
<namePart type="family">Koraş</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Bauer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean-Philippe</namePart>
<namePart type="family">Corbeil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amanda</namePart>
<namePart type="given">Butler</namePart>
<namePart type="family">Contreras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantin</namePart>
<namePart type="given">Marc</namePart>
<namePart type="family">Seibold</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaleb</namePart>
<namePart type="given">E</namePart>
<namePart type="family">Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">Friedrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jens</namePart>
<namePart type="family">Kleesiek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Arviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliya</namePart>
<namePart type="family">Habba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itay</namePart>
<namePart type="family">Itzhak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-261-9</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) hold significant potential for improving healthcare applications, with biomedically adapted models promising enhanced performance on medical tasks. However, the effectiveness of biomedical domain adaptation for clinical tasks remains uncertain. In this study, we conduct a direct comparison of 12 biomedically adapted models and their general-domain base counterparts across six clinical tasks. Our results reveal that 11 out of 12 biomedical models exhibit performance declines, challenging prior findings that reported positive effects of biomedical adaptation. Notably, previous positive results primarily relied on multiple-choice evaluations, which may not reflect performance in real-world clinical applications. To promote reproducibility and further research, we open-source our evaluation pipeline, providing a resource for the development of models with practical benefits in healthcare settings.</abstract>
<identifier type="citekey">dada-etal-2025-biomedical</identifier>
<location>
<url>https://aclanthology.org/2025.gem-1.5/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>46</start>
<end>59</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Does Biomedical Training Lead to Better Medical Performance?
%A Dada, Amin
%A Koraş, Osman Alperen
%A Bauer, Marie
%A Corbeil, Jean-Philippe
%A Contreras, Amanda Butler
%A Seibold, Constantin Marc
%A Smith, Kaleb E.
%A Friedrich, Julian
%A Kleesiek, Jens
%Y Arviv, Ofir
%Y Clinciu, Miruna
%Y Dhole, Kaustubh
%Y Dror, Rotem
%Y Gehrmann, Sebastian
%Y Habba, Eliya
%Y Itzhak, Itay
%Y Mille, Simon
%Y Perlitz, Yotam
%Y Santus, Enrico
%Y Sedoc, João
%Y Shmueli Scheuer, Michal
%Y Stanovsky, Gabriel
%Y Tafjord, Oyvind
%S Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria and virtual meeting
%@ 979-8-89176-261-9
%F dada-etal-2025-biomedical
%X Large Language Models (LLMs) hold significant potential for improving healthcare applications, with biomedically adapted models promising enhanced performance on medical tasks. However, the effectiveness of biomedical domain adaptation for clinical tasks remains uncertain. In this study, we conduct a direct comparison of 12 biomedically adapted models and their general-domain base counterparts across six clinical tasks. Our results reveal that 11 out of 12 biomedical models exhibit performance declines, challenging prior findings that reported positive effects of biomedical adaptation. Notably, previous positive results primarily relied on multiple-choice evaluations, which may not reflect performance in real-world clinical applications. To promote reproducibility and further research, we open-source our evaluation pipeline, providing a resource for the development of models with practical benefits in healthcare settings.
%U https://aclanthology.org/2025.gem-1.5/
%P 46-59
Markdown (Informal)
[Does Biomedical Training Lead to Better Medical Performance?](https://aclanthology.org/2025.gem-1.5/) (Dada et al., GEM 2025)
ACL
- Amin Dada, Osman Alperen Koraş, Marie Bauer, Jean-Philippe Corbeil, Amanda Butler Contreras, Constantin Marc Seibold, Kaleb E Smith, Julian Friedrich, and Jens Kleesiek. 2025. Does Biomedical Training Lead to Better Medical Performance? In Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²), pages 46–59, Vienna, Austria and virtual meeting. Association for Computational Linguistics.