@comment{
  Conference paper entry, one paper in the proceedings named in booktitle.
  Mixed-case names in title and booktitle are brace-protected as whole words
  so that styles which recase titles keep their capitalisation.
  The address value is kept exactly as given in the source record.
}
@inproceedings{menco-tovar-etal-2026-verbanexai,
  title     = {{VerbaNexAI} at {ClinicalSkillQA}: From Visual Evidence to Procedural Order A Two-Stage Generative Vision-Language Framework for {ClinSkillQA}},
  author    = {Menco Tovar, Andrea and
               Serrano, Jairo E. and
               Puertas, Edwin and
               Martinez-Santos, Juan Carlos},
  editor    = {Gupta, Deepak and
               Demner-Fushman, Dina},
  booktitle = {Proceedings of the {BioNLP} 2026 (Shared Tasks)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bionlp-2.2/},
  pages     = {6--12},
  isbn      = {979-8-89176-435-4},
  abstract  = {This work addresses the temporal ordering task of clinical frames in the Basic Life Support (BLS) subset of ClinSkillQA. A two-stage hybrid pipeline based on Qwen2-VL-2B-Instruct in a zero-shot configuration is proposed. In Stage 1, each image is processed independently to extract factual visual evidence, which is then transformed, using deterministic rules, into a structured representation. In Stage 2, ordering is formulated as an ordinal scoring task over procedural stages, with ties broken using PCA applied to multimodal embeddings. Evaluation followed the official benchmark protocol, considering Task Accuracy, Pairwise Accuracy, and BERTScore. In the test phase, the system achieved Task Accuracy = 0.17, Pairwise Micro Accuracy = 0.60, and BERT F1 = 0.71, with complete coverage in both predictions and rationales. The results demonstrate an interpretable and reproducible foundation, although challenges in fine-grained temporal discrimination remain.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding one record: the paper (title, four authors, abstract, pages) and its host proceedings (editors, publisher, place, ISBN). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="menco-tovar-etal-2026-verbanexai">
    <!-- Paper title -->
    <titleInfo>
      <title>VerbaNexAI at ClinicalSkillQA: From Visual Evidence to Procedural Order A Two-Stage Generative Vision-Language Framework for ClinSkillQA</title>
    </titleInfo>
    <!-- Authors, in listed order -->
    <name type="personal">
      <namePart type="given">Andrea</namePart>
      <namePart type="family">Menco Tovar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jairo</namePart>
      <namePart type="given">E</namePart>
      <namePart type="family">Serrano</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Edwin</namePart>
      <namePart type="family">Puertas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Juan</namePart>
      <namePart type="given">Carlos</namePart>
      <namePart type="family">Martinez-Santos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the BioNLP 2026 (Shared Tasks)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Deepak</namePart>
        <namePart type="family">Gupta</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Dina</namePart>
        <namePart type="family">Demner-Fushman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-435-4</identifier>
    </relatedItem>
    <abstract>This work addresses the temporal ordering task of clinical frames in the Basic Life Support (BLS) subset of ClinSkillQA. A two-stage hybrid pipeline based on Qwen2-VL-2B-Instruct in a zero-shot configuration is proposed. In Stage 1, each image is processed independently to extract factual visual evidence, which is then transformed, using deterministic rules, into a structured representation. In Stage 2, ordering is formulated as an ordinal scoring task over procedural stages, with ties broken using PCA applied to multimodal embeddings. Evaluation followed the official benchmark protocol, considering Task Accuracy, Pairwise Accuracy, and BERTScore. In the test phase, the system achieved Task Accuracy = 0.17, Pairwise Micro Accuracy = 0.60, and BERT F1 = 0.71, with complete coverage in both predictions and rationales. The results demonstrate an interpretable and reproducible foundation, although challenges in fine-grained temporal discrimination remain.</abstract>
    <identifier type="citekey">menco-tovar-etal-2026-verbanexai</identifier>
    <location>
      <url>https://aclanthology.org/2026.bionlp-2.2/</url>
    </location>
    <!-- Issue date and page range of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>6</start>
        <end>12</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T VerbaNexAI at ClinicalSkillQA: From Visual Evidence to Procedural Order A Two-Stage Generative Vision-Language Framework for ClinSkillQA
%A Menco Tovar, Andrea
%A Serrano, Jairo E.
%A Puertas, Edwin
%A Martinez-Santos, Juan Carlos
%Y Gupta, Deepak
%Y Demner-Fushman, Dina
%S Proceedings of the BioNLP 2026 (Shared Tasks)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-435-4
%F menco-tovar-etal-2026-verbanexai
%X This work addresses the temporal ordering task of clinical frames in the Basic Life Support (BLS) subset of ClinSkillQA. A two-stage hybrid pipeline based on Qwen2-VL-2B-Instruct in a zero-shot configuration is proposed. In Stage 1, each image is processed independently to extract factual visual evidence, which is then transformed, using deterministic rules, into a structured representation. In Stage 2, ordering is formulated as an ordinal scoring task over procedural stages, with ties broken using PCA applied to multimodal embeddings. Evaluation followed the official benchmark protocol, considering Task Accuracy, Pairwise Accuracy, and BERTScore. In the test phase, the system achieved Task Accuracy = 0.17, Pairwise Micro Accuracy = 0.60, and BERT F1 = 0.71, with complete coverage in both predictions and rationales. The results demonstrate an interpretable and reproducible foundation, although challenges in fine-grained temporal discrimination remain.
%U https://aclanthology.org/2026.bionlp-2.2/
%P 6-12
Markdown (Informal)
[VerbaNexAI at ClinicalSkillQA: From Visual Evidence to Procedural Order A Two-Stage Generative Vision-Language Framework for ClinSkillQA](https://aclanthology.org/2026.bionlp-2.2/) (Menco Tovar et al., BioNLP 2026)
ACL