@inproceedings{toma-etal-2024-wanglab-mediqa,
title = "{W}ang{L}ab at {MEDIQA}-{M}3{G} 2024: Multimodal Medical Answer Generation using Large Language Models",
author = "Xie, Ronald and
Palayew, Steven and
Toma, Augustin and
Bader, Gary and
Wang, Bo",
editor = "Naumann, Tristan and
Ben Abacha, Asma and
Bethard, Steven and
Roberts, Kirk and
Bitterman, Danielle",
booktitle = "Proceedings of the 6th Clinical Natural Language Processing Workshop",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.clinicalnlp-1.60/",
doi = "10.18653/v1/2024.clinicalnlp-1.60",
pages = "624--634",
abstract = "This paper outlines our submission to the MEDIQA2024 Multilingual and Multimodal Medical Answer Generation (M3G) shared task. We report results for two standalone solutions under the English category of the task, the first involving two consecutive API calls to the Claude 3 Opus API and the second involving training an image-disease label joint embedding in the style of CLIP for image classification. These two solutions scored 1st and 2nd place respectively on the competition leaderboard, substantially outperforming the next best solution. Additionally, we discuss insights gained from post-competition experiments. While the performance of these two described solutions have significant room for improvement due to the difficulty of the shared task and the challenging nature of medical visual question answering in general, we identify the multi-stage LLM approach and the CLIP image classification approach as promising avenues for further investigation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="toma-etal-2024-wanglab-mediqa">
<titleInfo>
<title>WangLab at MEDIQA-M3G 2024: Multimodal Medical Answer Generation using Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ronald</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Palayew</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Augustin</namePart>
<namePart type="family">Toma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gary</namePart>
<namePart type="family">Bader</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Clinical Natural Language Processing Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tristan</namePart>
<namePart type="family">Naumann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asma</namePart>
<namePart type="family">Ben Abacha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danielle</namePart>
<namePart type="family">Bitterman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper outlines our submission to the MEDIQA2024 Multilingual and Multimodal Medical Answer Generation (M3G) shared task. We report results for two standalone solutions under the English category of the task, the first involving two consecutive API calls to the Claude 3 Opus API and the second involving training an image-disease label joint embedding in the style of CLIP for image classification. These two solutions scored 1st and 2nd place respectively on the competition leaderboard, substantially outperforming the next best solution. Additionally, we discuss insights gained from post-competition experiments. While the performance of these two described solutions have significant room for improvement due to the difficulty of the shared task and the challenging nature of medical visual question answering in general, we identify the multi-stage LLM approach and the CLIP image classification approach as promising avenues for further investigation.</abstract>
<identifier type="citekey">toma-etal-2024-wanglab-mediqa</identifier>
<identifier type="doi">10.18653/v1/2024.clinicalnlp-1.60</identifier>
<location>
<url>https://aclanthology.org/2024.clinicalnlp-1.60/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>624</start>
<end>634</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WangLab at MEDIQA-M3G 2024: Multimodal Medical Answer Generation using Large Language Models
%A Xie, Ronald
%A Palayew, Steven
%A Toma, Augustin
%A Bader, Gary
%A Wang, Bo
%Y Naumann, Tristan
%Y Ben Abacha, Asma
%Y Bethard, Steven
%Y Roberts, Kirk
%Y Bitterman, Danielle
%S Proceedings of the 6th Clinical Natural Language Processing Workshop
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F toma-etal-2024-wanglab-mediqa
%X This paper outlines our submission to the MEDIQA2024 Multilingual and Multimodal Medical Answer Generation (M3G) shared task. We report results for two standalone solutions under the English category of the task, the first involving two consecutive API calls to the Claude 3 Opus API and the second involving training an image-disease label joint embedding in the style of CLIP for image classification. These two solutions scored 1st and 2nd place respectively on the competition leaderboard, substantially outperforming the next best solution. Additionally, we discuss insights gained from post-competition experiments. While the performance of these two described solutions have significant room for improvement due to the difficulty of the shared task and the challenging nature of medical visual question answering in general, we identify the multi-stage LLM approach and the CLIP image classification approach as promising avenues for further investigation.
%R 10.18653/v1/2024.clinicalnlp-1.60
%U https://aclanthology.org/2024.clinicalnlp-1.60/
%U https://doi.org/10.18653/v1/2024.clinicalnlp-1.60
%P 624-634
Markdown (Informal)
[WangLab at MEDIQA-M3G 2024: Multimodal Medical Answer Generation using Large Language Models](https://aclanthology.org/2024.clinicalnlp-1.60/) (Xie et al., ClinicalNLP 2024)
ACL