@inproceedings{gai-etal-2025-medthink,
title = "{M}ed{T}hink: A Rationale-Guided Framework for Explaining Medical Visual Question Answering",
author = "Gai, Xiaotang and
Zhou, Chenyi and
Liu, Jiaxiang and
Feng, Yang and
Wu, Jian and
Liu, Zuozhu",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.415/",
pages = "7438--7450",
ISBN = "979-8-89176-195-7",
abstract = "Medical Visual Question Answering (Med-VQA), which offers language responses to image-based medical inquiries, represents a challenging task and significant advancement in healthcare. It assists medical experts to swiftly interpret medical images, thereby enabling faster and more accurate diagnoses. However, the model interpretability and transparency of existing Med-VQA solutions are often limited, posing challenges in understanding their decision-making processes. To address this issue, we devise a semi-automated annotation process to streamline data preparation and build new benchmark Med-VQA datasets R-RAD, R-SLAKE and R-Path. These datasets provide intermediate medical decision-making rationales generated by multimodal large language models and human annotations for question-answering pairs in existing Med-VQA datasets, i.e., VQA-RAD, SLAKE and PathVQA. Moreover, we design a novel framework, MedThink, which finetunes lightweight pretrained generative models by incorporating medical decision-making rationales. MedThink includes three distinct strategies to generate decision outcomes and corresponding rationales, clearly showcasing the medical decision-making process during reasoning. Our comprehensive experiments show that our method achieves an accuracy of 83.5{\%} on R-RAD, 86.3{\%} on R-SLAKE and 87.2{\%} on R-Path. These results significantly exceed those of existing state-of-the-art models with comparable parameters. Datasets and code are available at https://github.com/Tang-xiaoxiao/Medthink."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="gai-etal-2025-medthink">
    <titleInfo>
      <title>MedThink: A Rationale-Guided Framework for Explaining Medical Visual Question Answering</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Xiaotang</namePart>
      <namePart type="family">Gai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chenyi</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiaxiang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yang</namePart>
      <namePart type="family">Feng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jian</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zuozhu</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>Medical Visual Question Answering (Med-VQA), which offers language responses to image-based medical inquiries, represents a challenging task and significant advancement in healthcare. It assists medical experts to swiftly interpret medical images, thereby enabling faster and more accurate diagnoses. However, the model interpretability and transparency of existing Med-VQA solutions are often limited, posing challenges in understanding their decision-making processes. To address this issue, we devise a semi-automated annotation process to streamline data preparation and build new benchmark Med-VQA datasets R-RAD, R-SLAKE and R-Path. These datasets provide intermediate medical decision-making rationales generated by multimodal large language models and human annotations for question-answering pairs in existing Med-VQA datasets, i.e., VQA-RAD, SLAKE and PathVQA. Moreover, we design a novel framework, MedThink, which finetunes lightweight pretrained generative models by incorporating medical decision-making rationales. MedThink includes three distinct strategies to generate decision outcomes and corresponding rationales, clearly showcasing the medical decision-making process during reasoning. Our comprehensive experiments show that our method achieves an accuracy of 83.5% on R-RAD, 86.3% on R-SLAKE and 87.2% on R-Path. These results significantly exceed those of existing state-of-the-art models with comparable parameters. Datasets and code are available at https://github.com/Tang-xiaoxiao/Medthink.</abstract>
    <identifier type="citekey">gai-etal-2025-medthink</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.415/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>7438</start>
        <end>7450</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T MedThink: A Rationale-Guided Framework for Explaining Medical Visual Question Answering
%A Gai, Xiaotang
%A Zhou, Chenyi
%A Liu, Jiaxiang
%A Feng, Yang
%A Wu, Jian
%A Liu, Zuozhu
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F gai-etal-2025-medthink
%X Medical Visual Question Answering (Med-VQA), which offers language responses to image-based medical inquiries, represents a challenging task and significant advancement in healthcare. It assists medical experts to swiftly interpret medical images, thereby enabling faster and more accurate diagnoses. However, the model interpretability and transparency of existing Med-VQA solutions are often limited, posing challenges in understanding their decision-making processes. To address this issue, we devise a semi-automated annotation process to streamline data preparation and build new benchmark Med-VQA datasets R-RAD, R-SLAKE and R-Path. These datasets provide intermediate medical decision-making rationales generated by multimodal large language models and human annotations for question-answering pairs in existing Med-VQA datasets, i.e., VQA-RAD, SLAKE and PathVQA. Moreover, we design a novel framework, MedThink, which finetunes lightweight pretrained generative models by incorporating medical decision-making rationales. MedThink includes three distinct strategies to generate decision outcomes and corresponding rationales, clearly showcasing the medical decision-making process during reasoning. Our comprehensive experiments show that our method achieves an accuracy of 83.5% on R-RAD, 86.3% on R-SLAKE and 87.2% on R-Path. These results significantly exceed those of existing state-of-the-art models with comparable parameters. Datasets and code are available at https://github.com/Tang-xiaoxiao/Medthink.
%U https://aclanthology.org/2025.findings-naacl.415/
%P 7438-7450
Markdown (Informal)
[MedThink: A Rationale-Guided Framework for Explaining Medical Visual Question Answering](https://aclanthology.org/2025.findings-naacl.415/) (Gai et al., Findings 2025)
ACL
Xiaotang Gai, Chenyi Zhou, Jiaxiang Liu, Yang Feng, Jian Wu, and Zuozhu Liu. 2025. MedThink: A Rationale-Guided Framework for Explaining Medical Visual Question Answering. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 7438–7450, Albuquerque, New Mexico. Association for Computational Linguistics.