% Workshop paper record for https://aclanthology.org/2026.smm4h-1.23/
% Case-sensitive words are braced as whole words so sentence-casing styles keep
% them. {\#} stays in a group of its own: letters sharing a top-level group with
% a leading control sequence would be lowercased by BibTeX's case conversion.
@inproceedings{arnoult-etal-2026-lotusorchid,
  title     = {{LotusOrchid} at {\#}{SMM4H}--{HeaRD} 2026: Fitting pretrained encoders for {Dutch} medical data},
  author    = {Arnoult, Sophie and
               Chen, Shutao and
               Vossen, Piek},
  editor    = {Lopez-Garcia, Guillermo and
               Gonzalez-Hernandez, Graciela},
  booktitle = {Proceedings of the 11th Social Media Mining for Health Research and Applications ({SMM4H}-{HeaRD} 2026) Workshop and Shared Tasks},
  month     = jul,
  year      = {2026},
  address   = {San Diego, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.smm4h-1.23/},
  pages     = {139--145},
  isbn      = {979-8-89176-432-3},
  abstract  = {This paper presents our submission to MultiClinAI's NER subtask for {\#}SMM4H-HeaRD 2026. We focus on the questions 1) which Language Model represents the clinical notes best and 2) which annotations can help training these models. To get answers for these questions, we follow a token-based classification approach with pretrained encoder language models, where we compare models that were pretrained on generic data against medical data, and on a single language, Dutch, against many languages. In addition, we present two data-augmented systems: one with data from the other languages of the workshop for multilingual training, and one with synthetic annotations.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single workshop paper; the containing proceedings is the host relatedItem. -->
  <mods ID="arnoult-etal-2026-lotusorchid">
    <titleInfo>
      <title>LotusOrchid at #SMM4H–HeaRD 2026: Fitting pretrained encoders for Dutch medical data</title>
    </titleInfo>

    <!-- Authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Sophie</namePart>
      <namePart type="family">Arnoult</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shutao</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Piek</namePart>
      <namePart type="family">Vossen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host volume: proceedings title, editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Guillermo</namePart>
        <namePart type="family">Lopez-Garcia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Graciela</namePart>
        <namePart type="family">Gonzalez-Hernandez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-432-3</identifier>
    </relatedItem>

    <abstract>This paper presents our submission to MultiClinAI’s NER subtask for #SMM4H-HeaRD 2026. We focus on the questions 1) which Language Model represents the clinical notes best and 2) which annotations can help training these models. To get answers for these questions, we follow a token-based classification approach with pretrained encoder language models, where we compare models that were pretrained on generic data against medical data, and on a single language, Dutch, against many languages. In addition, we present two data-augmented systems: one with data from the other languages of the workshop for multilingual training, and one with synthetic annotations.</abstract>

    <!-- Citation key and landing page for this record. -->
    <identifier type="citekey">arnoult-etal-2026-lotusorchid</identifier>
    <location>
      <url>https://aclanthology.org/2026.smm4h-1.23/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>139</start>
        <end>145</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LotusOrchid at #SMM4H–HeaRD 2026: Fitting pretrained encoders for Dutch medical data
%A Arnoult, Sophie
%A Chen, Shutao
%A Vossen, Piek
%Y Lopez-Garcia, Guillermo
%Y Gonzalez-Hernandez, Graciela
%S Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-432-3
%F arnoult-etal-2026-lotusorchid
%X This paper presents our submission to MultiClinAI’s NER subtask for #SMM4H-HeaRD 2026. We focus on the questions 1) which Language Model represents the clinical notes best and 2) which annotations can help training these models. To get answers for these questions, we follow a token-based classification approach with pretrained encoder language models, where we compare models that were pretrained on generic data against medical data, and on a single language, Dutch, against many languages. In addition, we present two data-augmented systems: one with data from the other languages of the workshop for multilingual training, and one with synthetic annotations.
%U https://aclanthology.org/2026.smm4h-1.23/
%P 139-145
Markdown (Informal)
[LotusOrchid at #SMM4H–HeaRD 2026: Fitting pretrained encoders for Dutch medical data](https://aclanthology.org/2026.smm4h-1.23/) (Arnoult et al., SMM4H 2026)
ACL