@comment{Conference-paper record for the work at the address given in the url field. The same citation is repeated after this entry in MODS XML and EndNote form.}
@inproceedings{aal-abdulsalam-etal-2026-multi,
  title     = {A Multi-Agent Open-Source {LLM} for Structured Cancer Registry Information Extraction from Pathology and Medical Reports},
  author    = {Aal Abdulsalam, Abdulrahman and
               Al Zaabi, Adhari and
               Jeeballah, Riham and
               El Keraby, Habiba},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bionlp-1.43/},
  pages     = {531--551},
  isbn      = {979-8-89176-434-7},
  abstract  = {Extracting structured cancer registry information from pathology and medical reports is challenging due to heterogeneous reporting styles and implicit clinical reasoning. We propose a modular multi-agent framework that decomposes registry abstraction into semantic chunking, retrieval, field-specific extraction, validation, evaluation, and aggregation stages. The dataset includes 818 annotated cancer cases from Sultan Qaboos University Hospital. Evaluation in this study focuses on breast (n=454) and colorectal (n=174) reports across grade, morphology, TNM staging, and laterality extraction tasks. The framework is compared against prompt-based LLaMA 3.3 baselines using accuracy and weighted/macro F1-score metrics. The proposed framework improved performance in context-dependent tasks, particularly grade extraction, where weighted F1-score increased from 0.71 to 0.78 for breast cancer and from 0.56 to 0.67 for colorectal cancer. Improvements were also observed for colorectal laterality extraction. For other extraction tasks, particularly highly structured tasks such as TNM staging and morphology extraction, the multi-agent framework achieved performance comparable to direct prompting. Although the baseline achieved slightly higher average weighted F1-scores overall, the proposed framework provides improved modularity, traceability, and pipeline-level interpretability through explicit intermediate reasoning stages, supporting error analysis and future clinician-guided refinement.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record; the record ID matches the citekey identifier further down. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aal-abdulsalam-etal-2026-multi">
<!-- Paper title and its four authors, each as a personal name with given/family parts. -->
<titleInfo>
<title>A Multi-Agent Open-Source LLM for Structured Cancer Registry Information Extraction from Pathology and Medical Reports</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abdulrahman</namePart>
<namePart type="family">Aal Abdulsalam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adhari</namePart>
<namePart type="family">Al Zaabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Riham</namePart>
<namePart type="family">Jeeballah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Habiba</namePart>
<namePart type="family">El Keraby</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the containing volume (BioNLP 2026) with its editors, publisher, place, genre and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<!-- Abstract text, followed by the citation key and the URL of the paper. -->
<abstract>Extracting structured cancer registry information from pathology and medical reports is challenging due to heterogeneous reporting styles and implicit clinical reasoning. We propose a modular multi-agent framework that decomposes registry abstraction into semantic chunking, retrieval, field-specific extraction, validation, evaluation, and aggregation stages. The dataset includes 818 annotated cancer cases from Sultan Qaboos University Hospital. Evaluation in this study focuses on breast (n=454) and colorectal (n=174) reports across grade, morphology, TNM staging, and laterality extraction tasks. The framework is compared against prompt-based LLaMA 3.3 baselines using accuracy and weighted/macro F1-score metrics. The proposed framework improved performance in context-dependent tasks, particularly grade extraction, where weighted F1-score increased from 0.71 to 0.78 for breast cancer and from 0.56 to 0.67 for colorectal cancer. Improvements were also observed for colorectal laterality extraction. For other extraction tasks, particularly highly structured tasks such as TNM staging and morphology extraction, the multi-agent framework achieved performance comparable to direct prompting. Although the baseline achieved slightly higher average weighted F1-scores overall, the proposed framework provides improved modularity, traceability, and pipeline-level interpretability through explicit intermediate reasoning stages, supporting error analysis and future clinician-guided refinement.</abstract>
<identifier type="citekey">aal-abdulsalam-etal-2026-multi</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.43/</url>
</location>
<!-- Part details: date and the page extent (start and end page). -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>531</start>
<end>551</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Multi-Agent Open-Source LLM for Structured Cancer Registry Information Extraction from Pathology and Medical Reports
%A Aal Abdulsalam, Abdulrahman
%A Al Zaabi, Adhari
%A Jeeballah, Riham
%A El Keraby, Habiba
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F aal-abdulsalam-etal-2026-multi
%X Extracting structured cancer registry information from pathology and medical reports is challenging due to heterogeneous reporting styles and implicit clinical reasoning. We propose a modular multi-agent framework that decomposes registry abstraction into semantic chunking, retrieval, field-specific extraction, validation, evaluation, and aggregation stages. The dataset includes 818 annotated cancer cases from Sultan Qaboos University Hospital. Evaluation in this study focuses on breast (n=454) and colorectal (n=174) reports across grade, morphology, TNM staging, and laterality extraction tasks. The framework is compared against prompt-based LLaMA 3.3 baselines using accuracy and weighted/macro F1-score metrics. The proposed framework improved performance in context-dependent tasks, particularly grade extraction, where weighted F1-score increased from 0.71 to 0.78 for breast cancer and from 0.56 to 0.67 for colorectal cancer. Improvements were also observed for colorectal laterality extraction. For other extraction tasks, particularly highly structured tasks such as TNM staging and morphology extraction, the multi-agent framework achieved performance comparable to direct prompting. Although the baseline achieved slightly higher average weighted F1-scores overall, the proposed framework provides improved modularity, traceability, and pipeline-level interpretability through explicit intermediate reasoning stages, supporting error analysis and future clinician-guided refinement.
%U https://aclanthology.org/2026.bionlp-1.43/
%P 531-551
Markdown (Informal)
[A Multi-Agent Open-Source LLM for Structured Cancer Registry Information Extraction from Pathology and Medical Reports](https://aclanthology.org/2026.bionlp-1.43/) (Aal Abdulsalam et al., BioNLP 2026)
ACL