@inproceedings{tepei-bloem-2025-linguistically,
title = "A Linguistically-informed Comparison between Multilingual {BERT} and Language-specific {BERT} Models: The Case of Differential Object Marking in {R}omanian",
author = "Tepei, Maria and
Bloem, Jelke",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.147/",
pages = "1271--1281",
abstract = "Current linguistic challenge datasets for language models focus on phenomena that exist in English. This may lead to a lack of attention for typological features beyond English. This is particularly an issue for multilingual models, which may be biased towards English by their training data and this bias may be amplified if benchmarks are also English-centered. We present the syntactically and semantically complex language phenomenon of Differential Object Marking (DOM) in Romanian as a challenging Masked Language Modelling task and compare the performance of monolingual and multilingual models. Results indicate that Romanian-specific BERT models perform better than equivalent multilingual one in representing this phenomenon."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tepei-bloem-2025-linguistically">
<titleInfo>
<title>A Linguistically-informed Comparison between Multilingual BERT and Language-specific BERT Models: The Case of Differential Object Marking in Romanian</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Tepei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jelke</namePart>
<namePart type="family">Bloem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Current linguistic challenge datasets for language models focus on phenomena that exist in English. This may lead to a lack of attention for typological features beyond English. This is particularly an issue for multilingual models, which may be biased towards English by their training data and this bias may be amplified if benchmarks are also English-centered. We present the syntactically and semantically complex language phenomenon of Differential Object Marking (DOM) in Romanian as a challenging Masked Language Modelling task and compare the performance of monolingual and multilingual models. Results indicate that Romanian-specific BERT models perform better than equivalent multilingual one in representing this phenomenon.</abstract>
<identifier type="citekey">tepei-bloem-2025-linguistically</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.147/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>1271</start>
<end>1281</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Linguistically-informed Comparison between Multilingual BERT and Language-specific BERT Models: The Case of Differential Object Marking in Romanian
%A Tepei, Maria
%A Bloem, Jelke
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F tepei-bloem-2025-linguistically
%X Current linguistic challenge datasets for language models focus on phenomena that exist in English. This may lead to a lack of attention for typological features beyond English. This is particularly an issue for multilingual models, which may be biased towards English by their training data and this bias may be amplified if benchmarks are also English-centered. We present the syntactically and semantically complex language phenomenon of Differential Object Marking (DOM) in Romanian as a challenging Masked Language Modelling task and compare the performance of monolingual and multilingual models. Results indicate that Romanian-specific BERT models perform better than equivalent multilingual one in representing this phenomenon.
%U https://aclanthology.org/2025.ranlp-1.147/
%P 1271-1281
Markdown (Informal)
[A Linguistically-informed Comparison between Multilingual BERT and Language-specific BERT Models: The Case of Differential Object Marking in Romanian](https://aclanthology.org/2025.ranlp-1.147/) (Tepei & Bloem, RANLP 2025)
ACL