@inproceedings{elshehaby-etal-2026-sparse,
title = "Sparse Category Routing and Fairness-Aware Optimization for Medical Decision Extraction",
author = "Elshehaby, Ahmed and
Abdalla, Mohamed and
Mohamed, Youssef",
editor = "Gupta, Deepak and
Demner-Fushman, Dina",
booktitle = "Proceedings of the {B}io{NLP} 2026 (Shared Tasks)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.bionlp-2.27/",
pages = "201--212",
ISBN = "979-8-89176-435-4",
abstract = "Extracting structured medical decisions from ICU discharge summaries is hard because of long documents, severe category imbalance across nine DICTUM decision types, and a fairness-aware evaluation that penalizes inconsistent performance across demographic subgroups. We present our system for the MedEx-ACT 2026 shared task (Elgaar et al., 2026), which fine-tunes BiomedBERT with a composite loss combining label-smoothed cross-entropy, a soft token-F1 auxiliary term, and R-Drop regularization. At inference time we apply a deterministic ensemble: half-offset sliding-window augmentation across four window configurations, dual-branch logit aggregation from the same checkpoint, per-category length calibration on the Anchor Branch, and sparse routing of categories 4 and 7 to a context-weighted specialist branch motivated by their unusual span-length distributions. Adding R-Drop improved validation Overall{\_}F1 by 1.24 points over the CE + soft-F1 baseline, with a larger 1.70-point gain on Worst-Group F1. Our best submission achieves Span F1 of 0.4900, Token F1 of 0.6796, and an official Overall{\_}F1 of 0.5724, with the African American subgroup as the Worst-Group bottleneck at Base{\_}Score 0.5601."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="elshehaby-etal-2026-sparse">
<titleInfo>
<title>Sparse Category Routing and Fairness-Aware Optimization for Medical Decision Extraction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Elshehaby</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Abdalla</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youssef</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the BioNLP 2026 (Shared Tasks)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Deepak</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-435-4</identifier>
</relatedItem>
<abstract>Extracting structured medical decisions from ICU discharge summaries is hard because of long documents, severe category imbalance across nine DICTUM decision types, and a fairness-aware evaluation that penalizes inconsistent performance across demographic subgroups. We present our system for the MedEx-ACT 2026 shared task (Elgaar et al., 2026), which fine-tunes BiomedBERT with a composite loss combining label-smoothed cross-entropy, a soft token-F1 auxiliary term, and R-Drop regularization. At inference time we apply a deterministic ensemble: half-offset sliding-window augmentation across four window configurations, dual-branch logit aggregation from the same checkpoint, per-category length calibration on the Anchor Branch, and sparse routing of categories 4 and 7 to a context-weighted specialist branch motivated by their unusual span-length distributions. Adding R-Drop improved validation Overall_F1 by 1.24 points over the CE + soft-F1 baseline, with a larger 1.70-point gain on Worst-Group F1. Our best submission achieves Span F1 of 0.4900, Token F1 of 0.6796, and an official Overall_F1 of 0.5724, with the African American subgroup as the Worst-Group bottleneck at Base_Score 0.5601.</abstract>
<identifier type="citekey">elshehaby-etal-2026-sparse</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-2.27/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>201</start>
<end>212</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sparse Category Routing and Fairness-Aware Optimization for Medical Decision Extraction
%A Elshehaby, Ahmed
%A Abdalla, Mohamed
%A Mohamed, Youssef
%Y Gupta, Deepak
%Y Demner-Fushman, Dina
%S Proceedings of the BioNLP 2026 (Shared Tasks)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-435-4
%F elshehaby-etal-2026-sparse
%X Extracting structured medical decisions from ICU discharge summaries is hard because of long documents, severe category imbalance across nine DICTUM decision types, and a fairness-aware evaluation that penalizes inconsistent performance across demographic subgroups. We present our system for the MedEx-ACT 2026 shared task (Elgaar et al., 2026), which fine-tunes BiomedBERT with a composite loss combining label-smoothed cross-entropy, a soft token-F1 auxiliary term, and R-Drop regularization. At inference time we apply a deterministic ensemble: half-offset sliding-window augmentation across four window configurations, dual-branch logit aggregation from the same checkpoint, per-category length calibration on the Anchor Branch, and sparse routing of categories 4 and 7 to a context-weighted specialist branch motivated by their unusual span-length distributions. Adding R-Drop improved validation Overall_F1 by 1.24 points over the CE + soft-F1 baseline, with a larger 1.70-point gain on Worst-Group F1. Our best submission achieves Span F1 of 0.4900, Token F1 of 0.6796, and an official Overall_F1 of 0.5724, with the African American subgroup as the Worst-Group bottleneck at Base_Score 0.5601.
%U https://aclanthology.org/2026.bionlp-2.27/
%P 201-212
Markdown (Informal)
[Sparse Category Routing and Fairness-Aware Optimization for Medical Decision Extraction](https://aclanthology.org/2026.bionlp-2.27/) (Elshehaby et al., BioNLP 2026)
ACL