@inproceedings{madrueno-etal-2026-urjc,
title = "{URJC}-Team at {\#}{SMM}4{H}-{H}ea{RD} 2026: {TNM} Stage Extraction with a Regex-{LLM} Workflow",
author = "Madrue{\~n}o, Natalia and
P{\'e}rez, Jose Walter Hern{\'a}ndez and
Fern{\'a}ndez, Rub{\'e}n R. and
Montalvo, Soto",
editor = "Lopez-Garcia, Guillermo and
Gonzalez-Hernandez, Graciela",
booktitle = "Proceedings of the 11th Social Media Mining for Health Research and Applications ({SMM}4{H}-{H}ea{RD} 2026) Workshop and Shared Tasks",
month = jul,
year = "2026",
address = "San Diego, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.smm4h-1.22/",
pages = "133--138",
ISBN = "979-8-89176-432-3",
abstract = "TNM cancer staging is a critical process for characterizing tumor burden and guiding clinical decisions. Nevertheless, its automated extraction remains challenging due to the unstructured and heterogeneous nature of free-text pathology reports. This paper describes the participation of the URJC-Team in Task 6 of the Social Media Mining for Health/Health Real-World Data ({\#}SMM4H-HeaRD) 2026 Shared Tasks. It focuses on predicting TNM staging from pathology reports. The proposed workflow combines hand-crafted regular expressions with a Large Language Model (LLM). First, explicit TNM mentions are extracted using rule-based patterns. Then, any stage not recovered by these rules is inferred by an LLM. Overall, the proposal provides competitive results across all official shared-task phases."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="madrueno-etal-2026-urjc">
<titleInfo>
<title>URJC-Team at #SMM4H-HeaRD 2026: TNM Stage Extraction with a Regex-LLM Workflow</title>
</titleInfo>
<name type="personal">
<namePart type="given">Natalia</namePart>
<namePart type="family">Madrueño</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jose</namePart>
<namePart type="given">Walter</namePart>
<namePart type="given">Hernández</namePart>
<namePart type="family">Pérez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rubén</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Fernández</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soto</namePart>
<namePart type="family">Montalvo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Lopez-Garcia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graciela</namePart>
<namePart type="family">Gonzalez-Hernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-432-3</identifier>
</relatedItem>
<abstract>TNM cancer staging is a critical process for characterizing tumor burden and guiding clinical decisions. Nevertheless, its automated extraction remains challenging due to the unstructured and heterogeneous nature of free-text pathology reports. This paper describes the participation of the URJC-Team in Task 6 of the Social Media Mining for Health/Health Real-World Data (#SMM4H-HeaRD) 2026 Shared Tasks. It focuses on predicting TNM staging from pathology reports. The proposed workflow combines hand-crafted regular expressions with a Large Language Model (LLM). First, explicit TNM mentions are extracted using rule-based patterns. Then, any stage not recovered by these rules is inferred by an LLM. Overall, the proposal provides competitive results across all official shared-task phases.</abstract>
<identifier type="citekey">madrueno-etal-2026-urjc</identifier>
<location>
<url>https://aclanthology.org/2026.smm4h-1.22/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>133</start>
<end>138</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T URJC-Team at #SMM4H-HeaRD 2026: TNM Stage Extraction with a Regex-LLM Workflow
%A Madrueño, Natalia
%A Pérez, Jose Walter Hernández
%A Fernández, Rubén R.
%A Montalvo, Soto
%Y Lopez-Garcia, Guillermo
%Y Gonzalez-Hernandez, Graciela
%S Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-432-3
%F madrueno-etal-2026-urjc
%X TNM cancer staging is a critical process for characterizing tumor burden and guiding clinical decisions. Nevertheless, its automated extraction remains challenging due to the unstructured and heterogeneous nature of free-text pathology reports. This paper describes the participation of the URJC-Team in Task 6 of the Social Media Mining for Health/Health Real-World Data (#SMM4H-HeaRD) 2026 Shared Tasks. It focuses on predicting TNM staging from pathology reports. The proposed workflow combines hand-crafted regular expressions with a Large Language Model (LLM). First, explicit TNM mentions are extracted using rule-based patterns. Then, any stage not recovered by these rules is inferred by an LLM. Overall, the proposal provides competitive results across all official shared-task phases.
%U https://aclanthology.org/2026.smm4h-1.22/
%P 133-138
Markdown (Informal)
[URJC-Team at #SMM4H-HeaRD 2026: TNM Stage Extraction with a Regex-LLM Workflow](https://aclanthology.org/2026.smm4h-1.22/) (Madrueño et al., SMM4H 2026)
ACL