% Overview paper for Task 6 of the SMM4H-HeaRD 2026 shared task (ACL Anthology record).
% The acronym is braced as a whole word so that sentence-casing styles keep its capitals;
% the leading {\#} stays in its own group because a brace group that starts with a
% control sequence is not case-protected by BibTeX.
@inproceedings{acitores-cortina-etal-2026-overview,
    title = "Overview of {\#}{SMM4H-HeaRD} 2026 {--} Task 6: Predicting {TNM} staging from pathology reports",
    author = "Acitores Cortina, Jose Miguel and
      Berkowitz, Jacob S. and
      Friedrich, Nadine A. and
      Tatonetti, Nicholas P.",
    editor = "Lopez-Garcia, Guillermo and
      Gonzalez-Hernandez, Graciela",
    booktitle = "Proceedings of the 11th Social Media Mining for Health Research and Applications ({SMM4H-HeaRD} 2026) Workshop and Shared Tasks",
    month = jul,
    year = "2026",
    address = "San Diego, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.smm4h-1.50/",
    pages = "332--337",
    isbn = "979-8-89176-432-3",
    abstract = "This paper provides an overview of Task 6 from the Social Media Mining for Health/Health Real-World Data shared task ({\#}SMM4H-HeaRD 2026), which focused on predicting TNM staging from pathology reports from TCGA. Seven teams submitted systems spanning fine-tuned clinical encoders, open-source generative LLMs, and closed-source API models. On a straightforward test set, most teams achieved near-perfect F1 scores (average 0.993, 0.972, and 0.957 for T, N, and M). However, on a harder tiebreak set where explicit TNM notation was removed and staging had to be inferred from clinical descriptions, performance dropped substantially (average 0.725, 0.783, and 0.846). Notably, the two teams using large closed-source API models generalized best to the harder set, achieving the highest T and N scores despite not leading on the easy set. These results suggest that while fine-tuned domain-specific encoders excel at surface-level extraction, larger general-purpose LLMs may be more robust when staging must be inferred from contextual clinical findings. All teams surpassed baseline overall performance on both test sets."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 collection holding a single record for the Task 6 overview paper. -->
<mods ID="acitores-cortina-etal-2026-overview">
<!-- The ID above matches the citekey identifier given further down in this record. -->
<titleInfo>
<title>Overview of #SMM4H-HeaRD 2026 – Task 6: Predicting TNM staging from pathology reports</title>
</titleInfo>
<!-- Authors, in order. Each given name or initial is its own namePart; initials carry no trailing period here. -->
<name type="personal">
<namePart type="given">Jose</namePart>
<namePart type="given">Miguel</namePart>
<namePart type="family">Acitores Cortina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Berkowitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadine</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Friedrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Tatonetti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, as year and month. -->
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing this paper, with its editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Lopez-Garcia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graciela</namePart>
<namePart type="family">Gonzalez-Hernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-432-3</identifier>
</relatedItem>
<abstract>This paper provides an overview of Task 6 from the Social Media Mining for Health/Health Real-World Data shared task (#SMM4H-HeaRD 2026), which focused on predicting TNM staging from pathology reports from TCGA. Seven teams submitted systems spanning fine-tuned clinical encoders, open-source generative LLMs, and closed-source API models. On a straightforward test set, most teams achieved near-perfect F1 scores (average 0.993, 0.972, and 0.957 for T, N, and M). However, on a harder tiebreak set where explicit TNM notation was removed and staging had to be inferred from clinical descriptions, performance dropped substantially (average 0.725, 0.783, and 0.846). Notably, the two teams using large closed-source API models generalized best to the harder set, achieving the highest T and N scores despite not leading on the easy set. These results suggest that while fine-tuned domain-specific encoders excel at surface-level extraction, larger general-purpose LLMs may be more robust when staging must be inferred from contextual clinical findings. All teams surpassed baseline overall performance on both test sets.</abstract>
<identifier type="citekey">acitores-cortina-etal-2026-overview</identifier>
<location>
<url>https://aclanthology.org/2026.smm4h-1.50/</url>
</location>
<!-- Position of the paper within the host volume: pages 332 to 337. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>332</start>
<end>337</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Overview of #SMM4H-HeaRD 2026 – Task 6: Predicting TNM staging from pathology reports
%A Acitores Cortina, Jose Miguel
%A Berkowitz, Jacob S.
%A Friedrich, Nadine A.
%A Tatonetti, Nicholas P.
%Y Lopez-Garcia, Guillermo
%Y Gonzalez-Hernandez, Graciela
%S Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-432-3
%F acitores-cortina-etal-2026-overview
%X This paper provides an overview of Task 6 from the Social Media Mining for Health/Health Real-World Data shared task (#SMM4H-HeaRD 2026), which focused on predicting TNM staging from pathology reports from TCGA. Seven teams submitted systems spanning fine-tuned clinical encoders, open-source generative LLMs, and closed-source API models. On a straightforward test set, most teams achieved near-perfect F1 scores (average 0.993, 0.972, and 0.957 for T, N, and M). However, on a harder tiebreak set where explicit TNM notation was removed and staging had to be inferred from clinical descriptions, performance dropped substantially (average 0.725, 0.783, and 0.846). Notably, the two teams using large closed-source API models generalized best to the harder set, achieving the highest T and N scores despite not leading on the easy set. These results suggest that while fine-tuned domain-specific encoders excel at surface-level extraction, larger general-purpose LLMs may be more robust when staging must be inferred from contextual clinical findings. All teams surpassed baseline overall performance on both test sets.
%U https://aclanthology.org/2026.smm4h-1.50/
%P 332-337
Markdown (Informal)
[Overview of #SMM4H-HeaRD 2026 – Task 6: Predicting TNM staging from pathology reports](https://aclanthology.org/2026.smm4h-1.50/) (Acitores Cortina et al., SMM4H 2026)
ACL