@inproceedings{demirhan-zadrozny-2026-uncc,
title = "{UNCC} at {M}ed{G}en{V}id{QA} 2026: Structured Temporal Grounding for Medical Video Question Answering",
author = "Demirhan, Hilmi and
Zadrozny, Wlodek",
editor = "Gupta, Deepak and
Demner-Fushman, Dina",
booktitle = "Proceedings of the {B}io{NLP} 2026 (Shared Tasks)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.bionlp-2.35/",
pages = "262--269",
ISBN = "979-8-89176-435-4",
abstract = "MedGenVidQA 2026 Task C evaluates visual answer localization in medical videos. The system receives a video and a question, then returns the start and end time of the visual answer. Our framework used timestamped automatic speech recognition (ASR) as a proposal source rather than as a final boundary label. The framework generated transcript tables, phase maps, lexical and dense candidate windows, schema-constrained ranking inputs, selective key-frame checks, and a deterministic validation pass for the final JSON file. The ranker selected among bounded candidate intervals instead of generating arbitrary timestamps over a full transcript. Each output can be traced to segment identifiers, candidate source families, selected anchors, phase labels, and validation flags. Our best run ranked fifth among six participant systems, with 62.50 IoU@0.3, 36.25 IoU@0.5, 22.50 IoU@0.7, and 42.57 mIoU. The threshold pattern suggests that coarse temporal retrieval was more reliable than strict start-end localization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="demirhan-zadrozny-2026-uncc">
<titleInfo>
<title>UNCC at MedGenVidQA 2026: Structured Temporal Grounding for Medical Video Question Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hilmi</namePart>
<namePart type="family">Demirhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wlodek</namePart>
<namePart type="family">Zadrozny</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the BioNLP 2026 (Shared Tasks)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Deepak</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-435-4</identifier>
</relatedItem>
<abstract>MedGenVidQA 2026 Task C evaluates visual answer localization in medical videos. The system receives a video and a question, then returns the start and end time of the visual answer. Our framework used timestamped automatic speech recognition (ASR) as a proposal source rather than as a final boundary label. The framework generated transcript tables, phase maps, lexical and dense candidate windows, schema-constrained ranking inputs, selective key-frame checks, and a deterministic validation pass for the final JSON file. The ranker selected among bounded candidate intervals instead of generating arbitrary timestamps over a full transcript. Each output can be traced to segment identifiers, candidate source families, selected anchors, phase labels, and validation flags. Our best run ranked fifth among six participant systems, with 62.50 IoU@0.3, 36.25 IoU@0.5, 22.50 IoU@0.7, and 42.57 mIoU. The threshold pattern suggests that coarse temporal retrieval was more reliable than strict start-end localization.</abstract>
<identifier type="citekey">demirhan-zadrozny-2026-uncc</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-2.35/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>262</start>
<end>269</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T UNCC at MedGenVidQA 2026: Structured Temporal Grounding for Medical Video Question Answering
%A Demirhan, Hilmi
%A Zadrozny, Wlodek
%Y Gupta, Deepak
%Y Demner-Fushman, Dina
%S Proceedings of the BioNLP 2026 (Shared Tasks)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-435-4
%F demirhan-zadrozny-2026-uncc
%X MedGenVidQA 2026 Task C evaluates visual answer localization in medical videos. The system receives a video and a question, then returns the start and end time of the visual answer. Our framework used timestamped automatic speech recognition (ASR) as a proposal source rather than as a final boundary label. The framework generated transcript tables, phase maps, lexical and dense candidate windows, schema-constrained ranking inputs, selective key-frame checks, and a deterministic validation pass for the final JSON file. The ranker selected among bounded candidate intervals instead of generating arbitrary timestamps over a full transcript. Each output can be traced to segment identifiers, candidate source families, selected anchors, phase labels, and validation flags. Our best run ranked fifth among six participant systems, with 62.50 IoU@0.3, 36.25 IoU@0.5, 22.50 IoU@0.7, and 42.57 mIoU. The threshold pattern suggests that coarse temporal retrieval was more reliable than strict start-end localization.
%U https://aclanthology.org/2026.bionlp-2.35/
%P 262-269
Markdown (Informal)
[UNCC at MedGenVidQA 2026: Structured Temporal Grounding for Medical Video Question Answering](https://aclanthology.org/2026.bionlp-2.35/) (Demirhan & Zadrozny, BioNLP 2026)
ACL