% BibTeX record for the MedGenVidQA 2026 shared-task overview paper,
% published in the BioNLP 2026 proceedings (ACL Anthology 2026.bionlp-1.88).
% Mixed-case names are brace-protected as whole words so that styles which
% sentence-case titles keep their capitalisation without splitting the word.
@inproceedings{gupta-etal-2026-overview,
  title     = {Overview of the {MedGenVidQA} 2026 Shared Task on Medical Generative Video Question Answering},
  author    = {Gupta, Deepak and
               Campbell, Collin and
               Golnari, Pedram and
               Demner-Fushman, Dina},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bionlp-1.88/},
  pages     = {1089--1100},
  isbn      = {979-8-89176-434-7},
  abstract  = {This paper presents an overview of the MedGenVidQA 2026 shared task on medical video question answering, collocated with the 25th BioNLP workshop at ACL 2026. The shared task addressed three related sub-tasks of the medical multimodal (textual and video) question answering: (i) multimodal retrieval tasks, (ii) multimodal answer generation with citations, and (iii) a visual answer localization task. The key theme of the stated task is to develop reliable multimodal question answering systems for consumers and medical professionals by leveraging generative models. A total of nine teams participated in the shared task challenges and submitted a total of forty-three submissions across all tasks. We performed both automated and human assessments to evaluate the submissions. This paper describes the tasks, datasets, evaluation metrics, participation, and baseline systems for all three tasks. Additionally, we summarize the techniques and results of the evaluation of the various approaches explored by the participating teams. Finally, we discuss the key findings and implications for the development of multimodal medical question answering.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; same citekey as the BibTeX entry. -->
  <mods ID="gupta-etal-2026-overview">
    <titleInfo>
      <title>Overview of the MedGenVidQA 2026 Shared Task on Medical Generative Video Question Answering</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Deepak</namePart>
      <namePart type="family">Gupta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Collin</namePart>
      <namePart type="family">Campbell</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pedram</namePart>
      <namePart type="family">Golnari</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dina</namePart>
      <namePart type="family">Demner-Fushman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>BioNLP 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Dina</namePart>
        <namePart type="family">Demner-Fushman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sophia</namePart>
        <namePart type="family">Ananiadou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kirk</namePart>
        <namePart type="family">Roberts</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Junichi</namePart>
        <namePart type="family">Tsujii</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-434-7</identifier>
    </relatedItem>
    <abstract>This paper presents an overview of the MedGenVidQA 2026 shared task on medical video question answering, collocated with the 25th BioNLP workshop at ACL 2026. The shared task addressed three related sub-tasks of the medical multimodal (textual and video) question answering: (i) multimodal retrieval tasks, (ii) multimodal answer generation with citations, and (iii) a visual answer localization task. The key theme of the stated task is to develop reliable multimodal question answering systems for consumers and medical professionals by leveraging generative models. A total of nine teams participated in the shared task challenges and submitted a total of forty-three submissions across all tasks. We performed both automated and human assessments to evaluate the submissions. This paper describes the tasks, datasets, evaluation metrics, participation, and baseline systems for all three tasks. Additionally, we summarize the techniques and results of the evaluation of the various approaches explored by the participating teams. Finally, we discuss the key findings and implications for the development of multimodal medical question answering.</abstract>
    <identifier type="citekey">gupta-etal-2026-overview</identifier>
    <location>
      <url>https://aclanthology.org/2026.bionlp-1.88/</url>
    </location>
    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1089</start>
        <end>1100</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Overview of the MedGenVidQA 2026 Shared Task on Medical Generative Video Question Answering
%A Gupta, Deepak
%A Campbell, Collin
%A Golnari, Pedram
%A Demner-Fushman, Dina
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F gupta-etal-2026-overview
%X This paper presents an overview of the MedGenVidQA 2026 shared task on medical video question answering, collocated with the 25th BioNLP workshop at ACL 2026. The shared task addressed three related sub-tasks of the medical multimodal (textual and video) question answering: (i) multimodal retrieval tasks, (ii) multimodal answer generation with citations, and (iii) a visual answer localization task. The key theme of the stated task is to develop reliable multimodal question answering systems for consumers and medical professionals by leveraging generative models. A total of nine teams participated in the shared task challenges and submitted a total of forty-three submissions across all tasks. We performed both automated and human assessments to evaluate the submissions. This paper describes the tasks, datasets, evaluation metrics, participation, and baseline systems for all three tasks. Additionally, we summarize the techniques and results of the evaluation of the various approaches explored by the participating teams. Finally, we discuss the key findings and implications for the development of multimodal medical question answering.
%U https://aclanthology.org/2026.bionlp-1.88/
%P 1089-1100
Markdown (Informal)
[Overview of the MedGenVidQA 2026 Shared Task on Medical Generative Video Question Answering](https://aclanthology.org/2026.bionlp-1.88/) (Gupta et al., BioNLP 2026)
ACL