BibTeX
@inproceedings{eslami-etal-2023-pubmedclip,
title = "{P}ub{M}ed{CLIP}: How Much Does {CLIP} Benefit Visual Question Answering in the Medical Domain?",
author = "Eslami, Sedigheh and
Meinel, Christoph and
de Melo, Gerard",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.88",
doi = "10.18653/v1/2023.findings-eacl.88",
pages = "1181--1193",
abstract = "Contrastive Language{--}Image Pre-training (CLIP) has shown remarkable success in learning with cross-modal supervision from extensive amounts of image{--}text pairs collected online. Thus far, the effectiveness of CLIP has been investigated primarily in general-domain multimodal problems. In this work, we evaluate the effectiveness of CLIP for the task of Medical Visual Question Answering (MedVQA). We present PubMedCLIP, a fine-tuned version of CLIP for the medical domain based on PubMed articles. Our experiments conducted on two MedVQA benchmark datasets illustrate that PubMedCLIP achieves superior results improving the overall accuracy up to 3{\%} in comparison to the state-of-the-art Model-Agnostic Meta-Learning (MAML) networks pre-trained only on visual data. The PubMedCLIP model with different back-ends, the source code for pre-training them and reproducing our MedVQA pipeline is publicly available at \url{https://github.com/sarahESL/PubMedCLIP}.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="eslami-etal-2023-pubmedclip">
    <titleInfo>
      <title>PubMedCLIP: How Much Does CLIP Benefit Visual Question Answering in the Medical Domain?</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Sedigheh</namePart>
      <namePart type="family">Eslami</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christoph</namePart>
      <namePart type="family">Meinel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gerard</namePart>
      <namePart type="family">de Melo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EACL 2023</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Vlachos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Isabelle</namePart>
        <namePart type="family">Augenstein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Contrastive Language–Image Pre-training (CLIP) has shown remarkable success in learning with cross-modal supervision from extensive amounts of image–text pairs collected online. Thus far, the effectiveness of CLIP has been investigated primarily in general-domain multimodal problems. In this work, we evaluate the effectiveness of CLIP for the task of Medical Visual Question Answering (MedVQA). We present PubMedCLIP, a fine-tuned version of CLIP for the medical domain based on PubMed articles. Our experiments conducted on two MedVQA benchmark datasets illustrate that PubMedCLIP achieves superior results improving the overall accuracy up to 3% in comparison to the state-of-the-art Model-Agnostic Meta-Learning (MAML) networks pre-trained only on visual data. The PubMedCLIP model with different back-ends, the source code for pre-training them and reproducing our MedVQA pipeline is publicly available at https://github.com/sarahESL/PubMedCLIP.</abstract>
    <identifier type="citekey">eslami-etal-2023-pubmedclip</identifier>
    <identifier type="doi">10.18653/v1/2023.findings-eacl.88</identifier>
    <location>
      <url>https://aclanthology.org/2023.findings-eacl.88</url>
    </location>
    <part>
      <date>2023-05</date>
      <extent unit="page">
        <start>1181</start>
        <end>1193</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T PubMedCLIP: How Much Does CLIP Benefit Visual Question Answering in the Medical Domain?
%A Eslami, Sedigheh
%A Meinel, Christoph
%A de Melo, Gerard
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F eslami-etal-2023-pubmedclip
%X Contrastive Language–Image Pre-training (CLIP) has shown remarkable success in learning with cross-modal supervision from extensive amounts of image–text pairs collected online. Thus far, the effectiveness of CLIP has been investigated primarily in general-domain multimodal problems. In this work, we evaluate the effectiveness of CLIP for the task of Medical Visual Question Answering (MedVQA). We present PubMedCLIP, a fine-tuned version of CLIP for the medical domain based on PubMed articles. Our experiments conducted on two MedVQA benchmark datasets illustrate that PubMedCLIP achieves superior results improving the overall accuracy up to 3% in comparison to the state-of-the-art Model-Agnostic Meta-Learning (MAML) networks pre-trained only on visual data. The PubMedCLIP model with different back-ends, the source code for pre-training them and reproducing our MedVQA pipeline is publicly available at https://github.com/sarahESL/PubMedCLIP.
%R 10.18653/v1/2023.findings-eacl.88
%U https://aclanthology.org/2023.findings-eacl.88
%U https://doi.org/10.18653/v1/2023.findings-eacl.88
%P 1181-1193
Markdown (Informal)
[PubMedCLIP: How Much Does CLIP Benefit Visual Question Answering in the Medical Domain?](https://aclanthology.org/2023.findings-eacl.88) (Eslami et al., Findings 2023)
ACL
Sedigheh Eslami, Christoph Meinel, and Gerard de Melo. 2023. PubMedCLIP: How Much Does CLIP Benefit Visual Question Answering in the Medical Domain?. In Findings of the Association for Computational Linguistics: EACL 2023, pages 1181–1193, Dubrovnik, Croatia. Association for Computational Linguistics.
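
Code sketch (illustrative)
The abstract describes PubMedCLIP as CLIP contrastively fine-tuned on image-caption pairs drawn from PubMed articles, later used as the visual encoder in a MedVQA pipeline. The short Python sketch below is only a minimal, hedged illustration of that general recipe using the Hugging Face transformers CLIP classes; it is not the authors' implementation (the official code is at https://github.com/sarahESL/PubMedCLIP), and the checkpoint name, toy data, batch size, and learning rate are placeholder assumptions.

import torch
from torch.utils.data import DataLoader
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Start from a general-domain CLIP checkpoint (placeholder choice).
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Toy stand-in for a PubMed-style figure-caption corpus; real training would
# load medical images and their captions instead.
pairs = [
    (Image.new("RGB", (224, 224)), f"placeholder medical figure caption {i}")
    for i in range(8)
]

def collate(batch):
    # Turn a list of (PIL image, caption) tuples into CLIP model inputs.
    images, captions = zip(*batch)
    return processor(text=list(captions), images=list(images),
                     return_tensors="pt", padding=True, truncation=True)

loader = DataLoader(pairs, batch_size=4, shuffle=True, collate_fn=collate)

model.train()
for batch in loader:
    # return_loss=True makes CLIPModel compute the symmetric contrastive
    # (InfoNCE) loss over the image-text similarity matrix in the batch.
    loss = model(**batch, return_loss=True).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

The fine-tuned image encoder would then stand in for a purely visual (e.g. MAML-pretrained) encoder in a downstream MedVQA model, which is the comparison the abstract reports.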