% ACL Anthology record for a paper in the SMM4H-HeaRD 2026 proceedings.
% Acronyms in title/booktitle are brace-protected as whole words so that
% sentence-casing styles keep their capitalisation. {\#} stays in its own
% group: merging it with the acronym would form a BibTeX "special character",
% whose letters are still case-folded.
@inproceedings{anastasia-2026-gmail,
  title     = {No{\_}gmail at {\#}{SMM4H-HeaRD} 2026: Detecting Patient Metadata in {COVID}-19 Scientific Literature: A Comparative Study of Encoder-Only and Autoregressive Language Models},
  author    = {Anastasia, Stefanescu},
  editor    = {Lopez-Garcia, Guillermo and Gonzalez-Hernandez, Graciela},
  booktitle = {Proceedings of the 11th Social Media Mining for Health Research and Applications ({SMM4H-HeaRD} 2026) Workshop and Shared Tasks},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.smm4h-1.46/},
  pages     = {280--285},
  isbn      = {979-8-89176-432-3},
  abstract  = {Identifying sentences in COVID-19 literature that report patient metadata is an important step in genomic epidemiology, currently requiring costly manual curation. We compare fine-tuned encoder-only models (BERT, BioLinkBERT) and autoregressive LLMs (Llama, Gemma, GPT-OSS) under prompting and fine-tuning regimes, using Focal Loss and undersampling to address severe class imbalance. Encoder-only models substantially outperform autoregressive models: BioLinkBERT-base with Focal Loss achieves macro F1 of 0.76, versus 0.54 for the best fine-tuned autoregressive model.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 record for citekey anastasia-2026-gmail: one paper (single author) whose host item is a proceedings volume with two editors. -->
<mods ID="anastasia-2026-gmail">
<titleInfo>
<title>No_gmail at #SMM4H-HeaRD 2026: Detecting Patient Metadata in COVID-19 Scientific Literature: A Comparative Study of Encoder-Only and Autoregressive Language Models</title>
</titleInfo>
<!-- NOTE(review): given="Stefanescu" / family="Anastasia" may be inverted; verify against the paper before changing. -->
<name type="personal">
<namePart type="given">Stefanescu</namePart>
<namePart type="family">Anastasia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Lopez-Garcia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graciela</namePart>
<namePart type="family">Gonzalez-Hernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-432-3</identifier>
</relatedItem>
<abstract>Identifying sentences in COVID-19 literature that report patient metadata is an important step in genomic epidemiology, currently requiring costly manual curation. We compare fine-tuned encoder-only models (BERT, BioLinkBERT) and autoregressive LLMs (Llama, Gemma, GPT-OSS) under prompting and fine-tuning regimes, using Focal Loss and undersampling to address severe class imbalance. Encoder-only models substantially outperform autoregressive models: BioLinkBERT-base with Focal Loss achieves macro F1 of 0.76, versus 0.54 for the best fine-tuned autoregressive model.</abstract>
<identifier type="citekey">anastasia-2026-gmail</identifier>
<location>
<url>https://aclanthology.org/2026.smm4h-1.46/</url>
</location>
<!-- Position of the paper within the host volume: issue date and page range. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>280</start>
<end>285</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T No_gmail at #SMM4H-HeaRD 2026: Detecting Patient Metadata in COVID-19 Scientific Literature: A Comparative Study of Encoder-Only and Autoregressive Language Models
%A Anastasia, Stefanescu
%Y Lopez-Garcia, Guillermo
%Y Gonzalez-Hernandez, Graciela
%S Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-432-3
%F anastasia-2026-gmail
%X Identifying sentences in COVID-19 literature that report patient metadata is an important step in genomic epidemiology, currently requiring costly manual curation. We compare fine-tuned encoder-only models (BERT, BioLinkBERT) and autoregressive LLMs (Llama, Gemma, GPT-OSS) under prompting and fine-tuning regimes, using Focal Loss and undersampling to address severe class imbalance. Encoder-only models substantially outperform autoregressive models: BioLinkBERT-base with Focal Loss achieves macro F1 of 0.76, versus 0.54 for the best fine-tuned autoregressive model.
%U https://aclanthology.org/2026.smm4h-1.46/
%P 280-285
Markdown (Informal)
[No_gmail at #SMM4H-HeaRD 2026: Detecting Patient Metadata in COVID-19 Scientific Literature: A Comparative Study of Encoder-Only and Autoregressive Language Models](https://aclanthology.org/2026.smm4h-1.46/) (Anastasia, SMM4H 2026)
ACL