@inproceedings{mahdoubi-etal-2026-data,
title = "Data Augmentation Based on Selective Masking of Language Models for {One Health} Context",
author = "Mahdoubi, Youssef and
Idrissi, Najlae and
Roche, Mathieu and
Valentin, Sarah",
editor = {Danilova, Vera and
Kurfal{\i}, Murathan and
S{\"o}derfeldt, Ylva and
Reed, Julia and
Burchell, Andrew},
booktitle = "Proceedings of the 1st Workshop on Linguistic Analysis for Health ({H}ea{L}ing 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.healing-1.23/",
pages = "266--276",
isbn = "979-8-89176-367-8",
abstract = "This study focuses on improving the performance of language models for two critical applications within the One Health context, specifically in epidemiological monitoring using textual data: (i) thematic classification across syndromic surveillance, biomedical and plant health domains, and (ii) detection of epidemic misinformation. A key challenge in these tasks is the limited availability of labeled textual data, which constrains the effectiveness of supervised learning methods. To overcome this limitation, we introduce two families of selective masking{--}based data augmentation strategies: lexical and non-lexical. Each family is implemented in a standard variant (Aug-SM-Lex and Aug-SM-NonLex), and a TF-IDF-weighted variant (Aug-SM-Lex-TFIDF and Aug-SM-NonLex-TFIDF). We perform two complementary experiments: the first determines the optimal masking rate, while the second evaluates the proposed strategies against LLM-based text reformulation. Experimental results indicate that selective masking-based augmentation outperformed both LLM-based reformulation (Mistral-7B and GPT-Neo-1.3B) and baseline models trained on original data alone across three of the five evaluated datasets, with the best performance achieved at a masking rate of 20{\%}. This suggests that selective masking is a promising approach, potentially more effective than computationally expensive LLM-based reformulation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mahdoubi-etal-2026-data">
<titleInfo>
<title>Data Augmentation Based on Selective Masking of Language Models for One Health Context</title>
</titleInfo>
<name type="personal">
<namePart type="given">Youssef</namePart>
<namePart type="family">Mahdoubi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Najlae</namePart>
<namePart type="family">Idrissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Roche</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Valentin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Linguistic Analysis for Health (HeaLing 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Danilova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Murathan</namePart>
<namePart type="family">Kurfalı</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ylva</namePart>
<namePart type="family">Söderfeldt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Reed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Burchell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-367-8</identifier>
</relatedItem>
<abstract>This study focuses on improving the performance of language models for two critical applications within the One Health context, specifically in epidemiological monitoring using textual data: (i) thematic classification across syndromic surveillance, biomedical and plant health domains, and (ii) detection of epidemic misinformation. A key challenge in these tasks is the limited availability of labeled textual data, which constrains the effectiveness of supervised learning methods. To overcome this limitation, we introduce two families of selective masking–based data augmentation strategies: lexical and non-lexical. Each family is implemented in a standard variant (Aug-SM-Lex and Aug-SM-NonLex), and a TF-IDF-weighted variant (Aug-SM-Lex-TFIDF and Aug-SM-NonLex-TFIDF). We perform two complementary experiments: the first determines the optimal masking rate, while the second evaluates the proposed strategies against LLM-based text reformulation. Experimental results indicate that selective masking-based augmentation outperformed both LLM-based reformulation (Mistral-7B and GPT-Neo-1.3B) and baseline models trained on original data alone across three of the five evaluated datasets, with the best performance achieved at a masking rate of 20%. This suggests that selective masking is a promising approach, potentially more effective than computationally expensive LLM-based reformulation.</abstract>
<identifier type="citekey">mahdoubi-etal-2026-data</identifier>
<location>
<url>https://aclanthology.org/2026.healing-1.23/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>266</start>
<end>276</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data Augmentation Based on Selective Masking of Language Models for One Health Context
%A Mahdoubi, Youssef
%A Idrissi, Najlae
%A Roche, Mathieu
%A Valentin, Sarah
%Y Danilova, Vera
%Y Kurfalı, Murathan
%Y Söderfeldt, Ylva
%Y Reed, Julia
%Y Burchell, Andrew
%S Proceedings of the 1st Workshop on Linguistic Analysis for Health (HeaLing 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-367-8
%F mahdoubi-etal-2026-data
%X This study focuses on improving the performance of language models for two critical applications within the One Health context, specifically in epidemiological monitoring using textual data: (i) thematic classification across syndromic surveillance, biomedical and plant health domains, and (ii) detection of epidemic misinformation. A key challenge in these tasks is the limited availability of labeled textual data, which constrains the effectiveness of supervised learning methods. To overcome this limitation, we introduce two families of selective masking–based data augmentation strategies: lexical and non-lexical. Each family is implemented in a standard variant (Aug-SM-Lex and Aug-SM-NonLex), and a TF-IDF-weighted variant (Aug-SM-Lex-TFIDF and Aug-SM-NonLex-TFIDF). We perform two complementary experiments: the first determines the optimal masking rate, while the second evaluates the proposed strategies against LLM-based text reformulation. Experimental results indicate that selective masking-based augmentation outperformed both LLM-based reformulation (Mistral-7B and GPT-Neo-1.3B) and baseline models trained on original data alone across three of the five evaluated datasets, with the best performance achieved at a masking rate of 20%. This suggests that selective masking is a promising approach, potentially more effective than computationally expensive LLM-based reformulation.
%U https://aclanthology.org/2026.healing-1.23/
%P 266-276
Markdown (Informal)
[Data Augmentation Based on Selective Masking of Language Models for One Health Context](https://aclanthology.org/2026.healing-1.23/) (Mahdoubi et al., HeaLing 2026)
ACL