@inproceedings{gallifant-etal-2024-language,
title = "Language Models are Surprisingly Fragile to Drug Names in Biomedical Benchmarks",
author = "Gallifant, Jack and
Chen, Shan and
Moreira, Pedro Jos{\'e} Ferreira and
Munch, Nikolaj and
Gao, Mingye and
Pond, Jackson and
Celi, Leo Anthony and
Aerts, Hugo and
Hartvigsen, Thomas and
Bitterman, Danielle",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.726/",
doi = "10.18653/v1/2024.findings-emnlp.726",
pages = "12448--12465",
abstract = "Medical knowledge is context-dependent and requires consistent reasoning across various natural language expressions of semantically equivalent phrases. This is particularly crucial for drug names, where patients often use brand names like Advil or Tylenol instead of their generic equivalents. To study this, we create a new robustness dataset, \textbf{RABBITS}, to evaluate performance differences on medical benchmarks after swapping brand and generic drug names using physician expert annotations. We assess both open-source and API-based LLMs on MedQA and MedMCQA, revealing a consistent performance drop ranging from 1-10{\%}. Furthermore, we identify a potential source of this fragility as the contamination of test data in widely used pre-training datasets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gallifant-etal-2024-language">
<titleInfo>
<title>Language Models are Surprisingly Fragile to Drug Names in Biomedical Benchmarks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Gallifant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shan</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="given">José</namePart>
<namePart type="given">Ferreira</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolaj</namePart>
<namePart type="family">Munch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingye</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="family">Pond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="given">Anthony</namePart>
<namePart type="family">Celi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hugo</namePart>
<namePart type="family">Aerts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Hartvigsen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danielle</namePart>
<namePart type="family">Bitterman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Medical knowledge is context-dependent and requires consistent reasoning across various natural language expressions of semantically equivalent phrases. This is particularly crucial for drug names, where patients often use brand names like Advil or Tylenol instead of their generic equivalents. To study this, we create a new robustness dataset, RABBITS, to evaluate performance differences on medical benchmarks after swapping brand and generic drug names using physician expert annotations. We assess both open-source and API-based LLMs on MedQA and MedMCQA, revealing a consistent performance drop ranging from 1-10%. Furthermore, we identify a potential source of this fragility as the contamination of test data in widely used pre-training datasets.</abstract>
<identifier type="citekey">gallifant-etal-2024-language</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.726</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.726/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>12448</start>
<end>12465</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Models are Surprisingly Fragile to Drug Names in Biomedical Benchmarks
%A Gallifant, Jack
%A Chen, Shan
%A Moreira, Pedro José Ferreira
%A Munch, Nikolaj
%A Gao, Mingye
%A Pond, Jackson
%A Celi, Leo Anthony
%A Aerts, Hugo
%A Hartvigsen, Thomas
%A Bitterman, Danielle
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F gallifant-etal-2024-language
%X Medical knowledge is context-dependent and requires consistent reasoning across various natural language expressions of semantically equivalent phrases. This is particularly crucial for drug names, where patients often use brand names like Advil or Tylenol instead of their generic equivalents. To study this, we create a new robustness dataset, RABBITS, to evaluate performance differences on medical benchmarks after swapping brand and generic drug names using physician expert annotations. We assess both open-source and API-based LLMs on MedQA and MedMCQA, revealing a consistent performance drop ranging from 1-10%. Furthermore, we identify a potential source of this fragility as the contamination of test data in widely used pre-training datasets.
%R 10.18653/v1/2024.findings-emnlp.726
%U https://aclanthology.org/2024.findings-emnlp.726/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.726
%P 12448-12465
Markdown (Informal)
[Language Models are Surprisingly Fragile to Drug Names in Biomedical Benchmarks](https://aclanthology.org/2024.findings-emnlp.726/) (Gallifant et al., Findings 2024)
ACL
- Jack Gallifant, Shan Chen, Pedro José Ferreira Moreira, Nikolaj Munch, Mingye Gao, Jackson Pond, Leo Anthony Celi, Hugo Aerts, Thomas Hartvigsen, and Danielle Bitterman. 2024. Language Models are Surprisingly Fragile to Drug Names in Biomedical Benchmarks. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 12448–12465, Miami, Florida, USA. Association for Computational Linguistics.