@inproceedings{lopez-otal-gracia-2026-language,
title = "``We Are (Language) Family'': Adapting Transformer models to related minority languages with linguistic data",
author = "L{\'o}pez-Otal, Miguel and
Gracia, Jorge",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Plum, Alistair and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the Second Workshop on Language Models for Low-Resource Languages ({L}o{R}es{LM} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.loreslm-1.26/",
pages = "297--310",
ISBN = "979-8-89176-377-7",
abstract = "Transformer-based language models, despite their widespread use, remain mostly unavailable for low-resourced languages (LRLs), due to their lack of texts for pre-training. While solutions have emerged to remedy this, they still almost exclusively rely on raw text corpora, which may be almost non-existent for some languages. A recent line of work has attempted to circumvent this by replacing these with linguistics-based materials, such as grammars, to adapt LRLs to these models. However, many approaches tend to work with languages that are typologically very distant to each other.In this work we investigate whether adapting closely related languages, belonging to the same family, with linguistics-based data can facilitate this process. For this, we look into the adaptation of two Spanish-based Transformer encoders {--}a monolingual and multilingual models{--} to Aragonese, a low-resourced Romance language spoken in Northern Spain, with whom it shares similar syntax but differing lexical and morphological phenomena. We rely on several knowledge injection methods, with which we report results, for a monolingual model, above some baselines in a set of Natural Language Understanding (NLU) benchmarks, proving the efficiency of relying on linguistics materials {--}or combined with a small amount of text{--} when languages belong to the same family."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lopez-otal-gracia-2026-language">
    <titleInfo>
      <title>“We Are (Language) Family”: Adapting Transformer models to related minority languages with linguistic data</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Miguel</namePart>
      <namePart type="family">López-Otal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jorge</namePart>
      <namePart type="family">Gracia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Hansi</namePart>
        <namePart type="family">Hettiarachchi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tharindu</namePart>
        <namePart type="family">Ranasinghe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alistair</namePart>
        <namePart type="family">Plum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Paul</namePart>
        <namePart type="family">Rayson</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ruslan</namePart>
        <namePart type="family">Mitkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohamed</namePart>
        <namePart type="family">Gaber</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Damith</namePart>
        <namePart type="family">Premasiri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Fiona</namePart>
        <namePart type="given">Anting</namePart>
        <namePart type="family">Tan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lasitha</namePart>
        <namePart type="family">Uyangodage</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-377-7</identifier>
    </relatedItem>
    <abstract>Transformer-based language models, despite their widespread use, remain mostly unavailable for low-resourced languages (LRLs), due to the lack of texts for pre-training. While solutions have emerged to remedy this, they still almost exclusively rely on raw text corpora, which may be almost non-existent for some languages. A recent line of work has attempted to circumvent this by replacing such corpora with linguistics-based materials, such as grammars, to adapt these models to LRLs. However, many approaches tend to work with languages that are typologically very distant from each other. In this work, we investigate whether adapting models to closely related languages, belonging to the same family, with linguistics-based data can facilitate this process. For this, we look into the adaptation of two Spanish-based Transformer encoders –a monolingual and a multilingual model– to Aragonese, a low-resourced Romance language spoken in Northern Spain, which shares a similar syntax with Spanish but differs in lexical and morphological phenomena. We rely on several knowledge injection methods, with which we report results for a monolingual model above some baselines on a set of Natural Language Understanding (NLU) benchmarks, showing the effectiveness of relying on linguistic materials –alone or combined with a small amount of text– when languages belong to the same family.</abstract>
<identifier type="citekey">lopez-otal-gracia-2026-language</identifier>
<location>
<url>https://aclanthology.org/2026.loreslm-1.26/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>297</start>
<end>310</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T “We Are (Language) Family”: Adapting Transformer models to related minority languages with linguistic data
%A López-Otal, Miguel
%A Gracia, Jorge
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Plum, Alistair
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-377-7
%F lopez-otal-gracia-2026-language
%X Transformer-based language models, despite their widespread use, remain mostly unavailable for low-resourced languages (LRLs), due to the lack of texts for pre-training. While solutions have emerged to remedy this, they still almost exclusively rely on raw text corpora, which may be almost non-existent for some languages. A recent line of work has attempted to circumvent this by replacing such corpora with linguistics-based materials, such as grammars, to adapt these models to LRLs. However, many approaches tend to work with languages that are typologically very distant from each other. In this work, we investigate whether adapting models to closely related languages, belonging to the same family, with linguistics-based data can facilitate this process. For this, we look into the adaptation of two Spanish-based Transformer encoders –a monolingual and a multilingual model– to Aragonese, a low-resourced Romance language spoken in Northern Spain, which shares a similar syntax with Spanish but differs in lexical and morphological phenomena. We rely on several knowledge injection methods, with which we report results for a monolingual model above some baselines on a set of Natural Language Understanding (NLU) benchmarks, showing the effectiveness of relying on linguistic materials –alone or combined with a small amount of text– when languages belong to the same family.
%U https://aclanthology.org/2026.loreslm-1.26/
%P 297-310

Markdown (Informal)
[“We Are (Language) Family”: Adapting Transformer models to related minority languages with linguistic data](https://aclanthology.org/2026.loreslm-1.26/) (López-Otal & Gracia, LoResLM 2026)
ACL
Miguel López-Otal and Jorge Gracia. 2026. [“We Are (Language) Family”: Adapting Transformer models to related minority languages with linguistic data](https://aclanthology.org/2026.loreslm-1.26/). In *Proceedings of the Second Workshop on Language Models for Low-Resource Languages (LoResLM 2026)*, pages 297–310, Rabat, Morocco. Association for Computational Linguistics.