% Conference paper in the BioNLP 2026 shared-task proceedings.
% Case-sensitive names in title/booktitle are brace-protected as whole words;
% the abstract is kept verbatim from the exported record.
@inproceedings{tian-dogan-2026-seahawk,
  title     = {Seahawk at {MedGenVidQA} 2026: {LLM} Segment-Range Selection for Medical Visual Answer Localization},
  author    = {Tian, Xiaotian and Dogan, Gulustan},
  editor    = {Gupta, Deepak and Demner-Fushman, Dina},
  booktitle = {Proceedings of the {BioNLP} 2026 (Shared Tasks)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bionlp-2.34/},
  pages     = {257--261},
  isbn      = {979-8-89176-435-4},
  abstract  = {Medical visual answer localization requires identifying the temporal span in a video where a medical question is answered or visually explained. We present a simple retrieval-and-selection pipeline for Task C that treats visual answer localization as segment-level answer paragraph selection over timestamped video transcripts. Given a question and a segmented transcript, our system prompts DeepSeek to select a contiguous range of transcript segments rather than directly generating timestamps. The final start and end times are then computed deterministically from the selected segment boundaries, decreasing the risk of hallucinated or malformed temporal outputs. To support long videos, we apply overlapping sliding-window prompting and rank candidate ranges using lexical question. In a 20-sample sanity check on test dataset, a completeness-biased configuration achieved an mIoU of 0.3217, while a shorter duration-penalized configuration improved performance to 0.4815. These results suggest that constrained LLM-based segment selection, combined with deterministic timestamp extraction, is a practical baseline for medical visual answer localization.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for one conference paper; the ID matches the citekey identifier below. -->
  <mods ID="tian-dogan-2026-seahawk">
    <titleInfo>
      <title>Seahawk at MedGenVidQA 2026: LLM Segment-Range Selection for Medical Visual Answer Localization</title>
    </titleInfo>

    <!-- Paper authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Xiaotian</namePart>
      <namePart type="family">Tian</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
    <name type="personal">
      <namePart type="given">Gulustan</namePart>
      <namePart type="family">Dogan</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the BioNLP 2026 (Shared Tasks)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Deepak</namePart>
        <namePart type="family">Gupta</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <name type="personal">
        <namePart type="given">Dina</namePart>
        <namePart type="family">Demner-Fushman</namePart>
        <role><roleTerm authority="marcrelator" type="text">editor</roleTerm></role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-435-4</identifier>
    </relatedItem>

    <!-- Abstract text is carried verbatim. -->
    <abstract>Medical visual answer localization requires identifying the temporal span in a video where a medical question is answered or visually explained. We present a simple retrieval-and-selection pipeline for Task C that treats visual answer localization as segment-level answer paragraph selection over timestamped video transcripts. Given a question and a segmented transcript, our system prompts DeepSeek to select a contiguous range of transcript segments rather than directly generating timestamps. The final start and end times are then computed deterministically from the selected segment boundaries, decreasing the risk of hallucinated or malformed temporal outputs. To support long videos, we apply overlapping sliding-window prompting and rank candidate ranges using lexical question. In a 20-sample sanity check on test dataset, a completeness-biased configuration achieved an mIoU of 0.3217, while a shorter duration-penalized configuration improved performance to 0.4815. These results suggest that constrained LLM-based segment selection, combined with deterministic timestamp extraction, is a practical baseline for medical visual answer localization.</abstract>
    <identifier type="citekey">tian-dogan-2026-seahawk</identifier>
    <location>
      <url>https://aclanthology.org/2026.bionlp-2.34/</url>
    </location>

    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>257</start>
        <end>261</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Seahawk at MedGenVidQA 2026: LLM Segment-Range Selection for Medical Visual Answer Localization
%A Tian, Xiaotian
%A Dogan, Gulustan
%Y Gupta, Deepak
%Y Demner-Fushman, Dina
%S Proceedings of the BioNLP 2026 (Shared Tasks)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-435-4
%F tian-dogan-2026-seahawk
%X Medical visual answer localization requires identifying the temporal span in a video where a medical question is answered or visually explained. We present a simple retrieval-and-selection pipeline for Task C that treats visual answer localization as segment-level answer paragraph selection over timestamped video transcripts. Given a question and a segmented transcript, our system prompts DeepSeek to select a contiguous range of transcript segments rather than directly generating timestamps. The final start and end times are then computed deterministically from the selected segment boundaries, decreasing the risk of hallucinated or malformed temporal outputs. To support long videos, we apply overlapping sliding-window prompting and rank candidate ranges using lexical question. In a 20-sample sanity check on test dataset, a completeness-biased configuration achieved an mIoU of 0.3217, while a shorter duration-penalized configuration improved performance to 0.4815. These results suggest that constrained LLM-based segment selection, combined with deterministic timestamp extraction, is a practical baseline for medical visual answer localization.
%U https://aclanthology.org/2026.bionlp-2.34/
%P 257-261
Markdown (Informal)
[Seahawk at MedGenVidQA 2026: LLM Segment-Range Selection for Medical Visual Answer Localization](https://aclanthology.org/2026.bionlp-2.34/) (Tian & Dogan, BioNLP 2026)
ACL