@inproceedings{verkijk-vossen-2026-tune,
title = "Out-Of-Tune rather than Fine-Tuned: How Pre-training, Fine-tuning and Tokenization Affect Semantic Similarity in a Historical, Non-Standardized Domain",
author = "Verkijk, Stella and
Vossen, Piek",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Plum, Alistair and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loreslm-1.45/",
pages = "515--531",
ISBN = "979-8-89176-377-7",
abstract = "Domain-specific encoder language models have been shown to accurately represent semantic distributions as they appear in the pre-training corpus. However, the general consensus is that general language models can adapt to a domain through fine-tuning. Similarly, multilingual models have been shown to leverage transfer learning even for languages that were not present in their pre-training data. Contrastively, tokenization has also been shown to have a great impact on a models' abilities to capture relevant semantic information, while this remains unchanged between pre-training and fine-tuning. This raises the question whether word embeddings for subtokens in models are of sufficient semantic quality for a target domain if not learned for the same domain. In this paper, we compare how different models assign similarity scores to different semantic categories in a highly specialized, non-standardised domain: Early Modern Dutch as written in the archives of the Dutch East India Company. Since the language in this domain is from before spelling conventions were established, and noise accumulates due to the fact that the original handwritten text went through a Handwritten Text Recognition pipeline, this use-case offers a unique opportunity to study both domain-specific semantics as well as a highly complex tokenization task for lesser-resourced languages. Our results support findings in earlier work that fine-tuned models may pick up spurious correlations in the adaptation process and stop relying on relevant semantics learned during pre-training."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="verkijk-vossen-2026-tune">
<titleInfo>
<title>Out-Of-Tune rather than Fine-Tuned: How Pre-training, Fine-tuning and Tokenization Affect Semantic Similarity in a Historical, Non-Standardized Domain</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stella</namePart>
<namePart type="family">Verkijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Piek</namePart>
<namePart type="family">Vossen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alistair</namePart>
<namePart type="family">Plum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-377-7</identifier>
</relatedItem>
<abstract>Domain-specific encoder language models have been shown to accurately represent semantic distributions as they appear in the pre-training corpus. However, the general consensus is that general language models can adapt to a domain through fine-tuning. Similarly, multilingual models have been shown to leverage transfer learning even for languages that were not present in their pre-training data. Contrastively, tokenization has also been shown to have a great impact on a models’ abilities to capture relevant semantic information, while this remains unchanged between pre-training and fine-tuning. This raises the question whether word embeddings for subtokens in models are of sufficient semantic quality for a target domain if not learned for the same domain. In this paper, we compare how different models assign similarity scores to different semantic categories in a highly specialized, non-standardised domain: Early Modern Dutch as written in the archives of the Dutch East India Company. Since the language in this domain is from before spelling conventions were established, and noise accumulates due to the fact that the original handwritten text went through a Handwritten Text Recognition pipeline, this use-case offers a unique opportunity to study both domain-specific semantics as well as a highly complex tokenization task for lesser-resourced languages. Our results support findings in earlier work that fine-tuned models may pick up spurious correlations in the adaptation process and stop relying on relevant semantics learned during pre-training.</abstract>
<identifier type="citekey">verkijk-vossen-2026-tune</identifier>
<location>
<url>https://aclanthology.org/2026.loreslm-1.45/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>515</start>
<end>531</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Out-Of-Tune rather than Fine-Tuned: How Pre-training, Fine-tuning and Tokenization Affect Semantic Similarity in a Historical, Non-Standardized Domain
%A Verkijk, Stella
%A Vossen, Piek
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F verkijk-vossen-2026-tune
%X Domain-specific encoder language models have been shown to accurately represent semantic distributions as they appear in the pre-training corpus. However, the general consensus is that general language models can adapt to a domain through fine-tuning. Similarly, multilingual models have been shown to leverage transfer learning even for languages that were not present in their pre-training data. In contrast, tokenization has also been shown to have a great impact on a model's ability to capture relevant semantic information, while the tokenizer itself remains unchanged between pre-training and fine-tuning. This raises the question of whether the word embeddings for subtokens in a model are of sufficient semantic quality for a target domain if they were not learned on that domain. In this paper, we compare how different models assign similarity scores to different semantic categories in a highly specialized, non-standardized domain: Early Modern Dutch as written in the archives of the Dutch East India Company. Since the language in this domain predates established spelling conventions, and noise accumulates because the original handwritten text went through a Handwritten Text Recognition pipeline, this use case offers a unique opportunity to study both domain-specific semantics and a highly complex tokenization task for lesser-resourced languages. Our results support findings in earlier work that fine-tuned models may pick up spurious correlations during adaptation and stop relying on relevant semantics learned during pre-training.
%U https://aclanthology.org/2026.loreslm-1.45/
%P 515-531
[Out-Of-Tune rather than Fine-Tuned: How Pre-training, Fine-tuning and Tokenization Affect Semantic Similarity in a Historical, Non-Standardized Domain](https://aclanthology.org/2026.loreslm-1.45/) (Verkijk & Vossen, LoResLM 2026)