% BibTeX record for a paper in the RepL4NLP-2025 workshop proceedings.
% Acronyms in booktitle are brace-protected so case-changing styles keep their capitals.
@inproceedings{kohli-etal-2025-choose,
  title     = {Choose Your Words Wisely: Domain-adaptive Masking Makes Language Models Learn Faster},
  author    = {Kohli, Vanshpreet S. and
               Monis, Aaron and
               Mamidi, Radhika},
  editor    = {Adlakha, Vaibhav and
               Chronopoulou, Alexandra and
               Li, Xiang Lorraine and
               Majumder, Bodhisattwa Prasad and
               Shi, Freda and
               Vernikos, Giorgos},
  booktitle = {Proceedings of the 10th Workshop on Representation Learning for {NLP} ({RepL4NLP-2025})},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, NM},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.repl4nlp-1.6/},
  doi       = {10.18653/v1/2025.repl4nlp-1.6},
  pages     = {87--91},
  isbn      = {979-8-89176-245-9},
  abstract  = {Foundational Language Models perform significantly better on downstream tasks in specialised domains (such as law, computer science, and medical science) upon being further pre-trained on extensive domain-specific corpora, but this continual pre-training incurs heavy computational costs. Indeed, some of the most performant specialised language models such as BioBERT incur even higher computing costs during domain-specific training than the pre-training cost of the foundational models they are initialised from. In this paper, we argue that much of the extended pre-training is redundant, with models seemingly wasting valuable resources re-learning lexical and semantic patterns already well-represented in their foundational models such as BERT, T5 and GPT. Focusing on Masked Language Models, we introduce a novel domain-specific masking strategy that is designed to facilitate continual learning while minimizing the training cost. Using this approach, we train and present a BERT-based model trained on a biomedical corpus that matches or surpasses traditionally trained biomedical language models in performance across several downstream classification tasks while incurring up to 11 times lower training costs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="kohli-etal-2025-choose">
    <titleInfo>
      <title>Choose Your Words Wisely: Domain-adaptive Masking Makes Language Models Learn Faster</title>
    </titleInfo>

    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Vanshpreet</namePart>
      <namePart type="given">S</namePart>
      <namePart type="family">Kohli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aaron</namePart>
      <namePart type="family">Monis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Radhika</namePart>
      <namePart type="family">Mamidi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th Workshop on Representation Learning for NLP (RepL4NLP-2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vaibhav</namePart>
        <namePart type="family">Adlakha</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alexandra</namePart>
        <namePart type="family">Chronopoulou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xiang</namePart>
        <namePart type="given">Lorraine</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bodhisattwa</namePart>
        <namePart type="given">Prasad</namePart>
        <namePart type="family">Majumder</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Freda</namePart>
        <namePart type="family">Shi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Giorgos</namePart>
        <namePart type="family">Vernikos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, NM</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-245-9</identifier>
    </relatedItem>

    <abstract>Foundational Language Models perform significantly better on downstream tasks in specialised domains (such as law, computer science, and medical science) upon being further pre-trained on extensive domain-specific corpora, but this continual pre-training incurs heavy computational costs. Indeed, some of the most performant specialised language models such as BioBERT incur even higher computing costs during domain-specific training than the pre-training cost of the foundational models they are initialised from. In this paper, we argue that much of the extended pre-training is redundant, with models seemingly wasting valuable resources re-learning lexical and semantic patterns already well-represented in their foundational models such as BERT, T5 and GPT. Focusing on Masked Language Models, we introduce a novel domain-specific masking strategy that is designed to facilitate continual learning while minimizing the training cost. Using this approach, we train and present a BERT-based model trained on a biomedical corpus that matches or surpasses traditionally trained biomedical language models in performance across several downstream classification tasks while incurring up to 11 times lower training costs.</abstract>

    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">kohli-etal-2025-choose</identifier>
    <identifier type="doi">10.18653/v1/2025.repl4nlp-1.6</identifier>
    <location>
      <url>https://aclanthology.org/2025.repl4nlp-1.6/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-05</date>
      <extent unit="page">
        <start>87</start>
        <end>91</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Choose Your Words Wisely: Domain-adaptive Masking Makes Language Models Learn Faster
%A Kohli, Vanshpreet S.
%A Monis, Aaron
%A Mamidi, Radhika
%Y Adlakha, Vaibhav
%Y Chronopoulou, Alexandra
%Y Li, Xiang Lorraine
%Y Majumder, Bodhisattwa Prasad
%Y Shi, Freda
%Y Vernikos, Giorgos
%S Proceedings of the 10th Workshop on Representation Learning for NLP (RepL4NLP-2025)
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, NM
%@ 979-8-89176-245-9
%F kohli-etal-2025-choose
%X Foundational Language Models perform significantly better on downstream tasks in specialised domains (such as law, computer science, and medical science) upon being further pre-trained on extensive domain-specific corpora, but this continual pre-training incurs heavy computational costs. Indeed, some of the most performant specialised language models such as BioBERT incur even higher computing costs during domain-specific training than the pre-training cost of the foundational models they are initialised from. In this paper, we argue that much of the extended pre-training is redundant, with models seemingly wasting valuable resources re-learning lexical and semantic patterns already well-represented in their foundational models such as BERT, T5 and GPT. Focusing on Masked Language Models, we introduce a novel domain-specific masking strategy that is designed to facilitate continual learning while minimizing the training cost. Using this approach, we train and present a BERT-based model trained on a biomedical corpus that matches or surpasses traditionally trained biomedical language models in performance across several downstream classification tasks while incurring up to 11 times lower training costs.
%R 10.18653/v1/2025.repl4nlp-1.6
%U https://aclanthology.org/2025.repl4nlp-1.6/
%U https://doi.org/10.18653/v1/2025.repl4nlp-1.6
%P 87-91
Markdown (Informal)
[Choose Your Words Wisely: Domain-adaptive Masking Makes Language Models Learn Faster](https://aclanthology.org/2025.repl4nlp-1.6/) (Kohli et al., RepL4NLP 2025)
ACL