@inproceedings{stachura-etal-2026-expert,
title = "Expert-Guided Schema-Based Structured Extraction from {CONSORT} Diagrams Using Vision-Language Models",
author = "Stachura, Damian and
Przechera, Bartosz and
Opa{\l}ek, Monika and
Sadowska, Ewelina and
Borowiack, Ewa and
Nowak, Artur",
editor = "Demner-Fushman, Dina and
Ananiadou, Sophia and
Roberts, Kirk and
Tsujii, Junichi",
booktitle = "{B}io{NLP} 2026",
month = jul,
year = "2026",
address = "San Diego, California",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.bionlp-1.77/",
pages = "955--969",
ISBN = "979-8-89176-434-7",
abstract = "Visual-language models (VLMs) are rapidly advancing on tasks that require visual understanding of text, tables, plots, and diagrams. Yet extracting structured information from text-heavy scientific diagrams remains challenging, as it requires not only OCR but also recovery of layout, grouping, and flow relationships. We study this problem in the context of CONSORT flow diagrams, which summarize participant screening, randomization, follow-up, and analysis in randomized controlled trials. We introduce a 200-example benchmark of PubMed Central diagrams, annotated by a biomedical team specializing in systematic literature reviews and clinical evidence extraction, and evaluate schema-constrained CONSORT extraction across proprietary and open-weight model families. Using structure-aware metrics, we compare single-pass and stepwise extraction strategies. Expert-guided single-pass extraction performs best for proprietary frontier models, with Gemini 3 Pro achieving the strongest overall results, whereas stepwise prompting improves less capable open-weight models on challenging arm-level extraction. These results offer practical deployment guidance and suggest that high-quality schema-constrained extraction is feasible, but not yet solved."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="stachura-etal-2026-expert">
<titleInfo>
<title>Expert-Guided Schema-Based Structured Extraction from CONSORT Diagrams Using Vision-Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Damian</namePart>
<namePart type="family">Stachura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bartosz</namePart>
<namePart type="family">Przechera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Monika</namePart>
<namePart type="family">Opałek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ewelina</namePart>
<namePart type="family">Sadowska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ewa</namePart>
<namePart type="family">Borowiack</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Artur</namePart>
<namePart type="family">Nowak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<abstract>Visual-language models (VLMs) are rapidly advancing on tasks that require visual understanding of text, tables, plots, and diagrams. Yet extracting structured information from text-heavy scientific diagrams remains challenging, as it requires not only OCR but also recovery of layout, grouping, and flow relationships. We study this problem in the context of CONSORT flow diagrams, which summarize participant screening, randomization, follow-up, and analysis in randomized controlled trials. We introduce a 200-example benchmark of PubMed Central diagrams, annotated by a biomedical team specializing in systematic literature reviews and clinical evidence extraction, and evaluate schema-constrained CONSORT extraction across proprietary and open-weight model families. Using structure-aware metrics, we compare single-pass and stepwise extraction strategies. Expert-guided single-pass extraction performs best for proprietary frontier models, with Gemini 3 Pro achieving the strongest overall results, whereas stepwise prompting improves less capable open-weight models on challenging arm-level extraction. These results offer practical deployment guidance and suggest that high-quality schema-constrained extraction is feasible, but not yet solved.</abstract>
<identifier type="citekey">stachura-etal-2026-expert</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.77/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>955</start>
<end>969</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Expert-Guided Schema-Based Structured Extraction from CONSORT Diagrams Using Vision-Language Models
%A Stachura, Damian
%A Przechera, Bartosz
%A Opałek, Monika
%A Sadowska, Ewelina
%A Borowiack, Ewa
%A Nowak, Artur
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F stachura-etal-2026-expert
%X Visual-language models (VLMs) are rapidly advancing on tasks that require visual understanding of text, tables, plots, and diagrams. Yet extracting structured information from text-heavy scientific diagrams remains challenging, as it requires not only OCR but also recovery of layout, grouping, and flow relationships. We study this problem in the context of CONSORT flow diagrams, which summarize participant screening, randomization, follow-up, and analysis in randomized controlled trials. We introduce a 200-example benchmark of PubMed Central diagrams, annotated by a biomedical team specializing in systematic literature reviews and clinical evidence extraction, and evaluate schema-constrained CONSORT extraction across proprietary and open-weight model families. Using structure-aware metrics, we compare single-pass and stepwise extraction strategies. Expert-guided single-pass extraction performs best for proprietary frontier models, with Gemini 3 Pro achieving the strongest overall results, whereas stepwise prompting improves less capable open-weight models on challenging arm-level extraction. These results offer practical deployment guidance and suggest that high-quality schema-constrained extraction is feasible, but not yet solved.
%U https://aclanthology.org/2026.bionlp-1.77/
%P 955-969
Markdown (Informal)
[Expert-Guided Schema-Based Structured Extraction from CONSORT Diagrams Using Vision-Language Models](https://aclanthology.org/2026.bionlp-1.77/) (Stachura et al., BioNLP 2026)
ACL