@inproceedings{ursu-soare-2026-metaminers,
title = "{M}eta{M}iners at {SMM}4{H}-{H}ea{RD} 2026: A Semantic-Structural Knowledge-Enriched Ensemble for {SARS}-{C}o{V}-2 Metadata Identification",
author = "Ursu, Claudia-Alexandra and
Soare, Alecsandru-Florin",
editor = "Lopez-Garcia, Guillermo and
Gonzalez-Hernandez, Graciela",
booktitle = "Proceedings of the 11th Social Media Mining for Health Research and Applications ({SMM}4{H}-{H}ea{RD} 2026) Workshop and Shared Tasks",
month = jul,
year = "2026",
address = "San Diego, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.smm4h-1.45/",
pages = "272--279",
ISBN = "979-8-89176-432-3",
abstract = "This paper presents a hybrid solution for a binary classification of medical PubMed articles created for identifying reports that associate clinical metadata with SARS-CoV-2 genomic sequences. The system is designed to catch the subtle distinction between reports of sequence-associated patient metadata and sentences where such metadata is either unrelated, irrelevant, or linked to previous studies. The biggest challenge is the fact that the medical dataset is highly imbalanced, with only 13.3{\%} of medical reports labeled positive. Our system proposes a hybrid system that combines 4 approaches: dual-evidence tagging, negation-aware suppression, semantic frame extraction, and adversarial training. All these approaches were tested on multiple models: BiomedBERT-base-abstract, BioLinkBERT-large, PubMedBERT-base-fulltext, followed by a best subset ensemble search to obtain the result of 0.792 F1 score, setting a new benchmark and positioning the solution in 1st place in the competition."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ursu-soare-2026-metaminers">
<titleInfo>
<title>MetaMiners at SMM4H-HeaRD 2026: A Semantic-Structural Knowledge-Enriched Ensemble for SARS-CoV-2 Metadata Identification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claudia-Alexandra</namePart>
<namePart type="family">Ursu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alecsandru-Florin</namePart>
<namePart type="family">Soare</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guillermo</namePart>
<namePart type="family">Lopez-Garcia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graciela</namePart>
<namePart type="family">Gonzalez-Hernandez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-432-3</identifier>
</relatedItem>
<abstract>This paper presents a hybrid solution for a binary classification of medical PubMed articles created for identifying reports that associate clinical metadata with SARS-CoV-2 genomic sequences. The system is designed to catch the subtle distinction between reports of sequence-associated patient metadata and sentences where such metadata is either unrelated, irrelevant, or linked to previous studies. The biggest challenge is the fact that the medical dataset is highly imbalanced, with only 13.3% of medical reports labeled positive. Our system proposes a hybrid system that combines 4 approaches: dual-evidence tagging, negation-aware suppression, semantic frame extraction, and adversarial training. All these approaches were tested on multiple models: BiomedBERT-base-abstract, BioLinkBERT-large, PubMedBERT-base-fulltext, followed by a best subset ensemble search to obtain the result of 0.792 F1 score, setting a new benchmark and positioning the solution in 1st place in the competition.</abstract>
<identifier type="citekey">ursu-soare-2026-metaminers</identifier>
<location>
<url>https://aclanthology.org/2026.smm4h-1.45/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>272</start>
<end>279</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MetaMiners at SMM4H-HeaRD 2026: A Semantic-Structural Knowledge-Enriched Ensemble for SARS-CoV-2 Metadata Identification
%A Ursu, Claudia-Alexandra
%A Soare, Alecsandru-Florin
%Y Lopez-Garcia, Guillermo
%Y Gonzalez-Hernandez, Graciela
%S Proceedings of the 11th Social Media Mining for Health Research and Applications (SMM4H-HeaRD 2026) Workshop and Shared Tasks
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, United States
%@ 979-8-89176-432-3
%F ursu-soare-2026-metaminers
%X This paper presents a hybrid solution for a binary classification of medical PubMed articles created for identifying reports that associate clinical metadata with SARS-CoV-2 genomic sequences. The system is designed to catch the subtle distinction between reports of sequence-associated patient metadata and sentences where such metadata is either unrelated, irrelevant, or linked to previous studies. The biggest challenge is the fact that the medical dataset is highly imbalanced, with only 13.3% of medical reports labeled positive. Our system proposes a hybrid system that combines 4 approaches: dual-evidence tagging, negation-aware suppression, semantic frame extraction, and adversarial training. All these approaches were tested on multiple models: BiomedBERT-base-abstract, BioLinkBERT-large, PubMedBERT-base-fulltext, followed by a best subset ensemble search to obtain the result of 0.792 F1 score, setting a new benchmark and positioning the solution in 1st place in the competition.
%U https://aclanthology.org/2026.smm4h-1.45/
%P 272-279
Markdown (Informal)
[MetaMiners at SMM4H-HeaRD 2026: A Semantic-Structural Knowledge-Enriched Ensemble for SARS-CoV-2 Metadata Identification](https://aclanthology.org/2026.smm4h-1.45/) (Ursu & Soare, SMM4H 2026)
ACL