@inproceedings{voinea-2026-validator,
title = "Validator-Guided Hard Negative Mining for Masked Language Modeling in Low-Resource Ancient Languages",
author = "Voinea, Andrei",
editor = "T.Y.S.S., Santosh and
Rodriguez, Juan Diego and
de Gibert, Ona",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-srw.69/",
pages = "779--790",
ISBN = "979-8-89176-393-7",
abstract = "Masked language modeling for low-resource ancient languages remains challenging because pre-trained multilingual models lack exposure to these languages. We investigate rule-based linguistic constraints and hard negative mining for Sumerian, a language isolate not included in multilingual BERT{'}s training data. We build a hierarchical validator capturing subword, word, and part-of-speech patterns from 4,545 annotated sequences, using it to filter candidates and identify hard negatives for fine-tuning. Vanilla mBERT achieves 18.0{\%} hit@10 accuracy. The validator alone improves this to 72.8{\%}, while hard negative fine-tuning reaches 78.3{\%}. Combining both yields 86.7{\%}, a 68.7 percentage point improvement. Temporal generalization evaluation on tablets from 600 years earlier shows that both the hard negative mining and the validator alone improve performance, but the combined approach underperforms due to the validator{'}s period specific rules. These findings demonstrate that hard negative mining transfers across periods while explicit rule-based constraints provide strong in-domain improvements but limited cross-period generalization."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="voinea-2026-validator">
<titleInfo>
<title>Validator-Guided Hard Negative Mining for Masked Language Modeling in Low-Resource Ancient Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andrei</namePart>
<namePart type="family">Voinea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Santosh</namePart>
<namePart type="family">T.Y.S.S.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">Diego</namePart>
<namePart type="family">Rodriguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ona</namePart>
<namePart type="family">de Gibert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-393-7</identifier>
</relatedItem>
<abstract>Masked language modeling for low-resource ancient languages remains challenging because pre-trained multilingual models lack exposure to these languages. We investigate rule-based linguistic constraints and hard negative mining for Sumerian, a language isolate not included in multilingual BERT’s training data. We build a hierarchical validator capturing subword, word, and part-of-speech patterns from 4,545 annotated sequences, using it to filter candidates and identify hard negatives for fine-tuning. Vanilla mBERT achieves 18.0% hit@10 accuracy. The validator alone improves this to 72.8%, while hard negative fine-tuning reaches 78.3%. Combining both yields 86.7%, a 68.7 percentage point improvement. Temporal generalization evaluation on tablets from 600 years earlier shows that both the hard negative mining and the validator alone improve performance, but the combined approach underperforms due to the validator’s period specific rules. These findings demonstrate that hard negative mining transfers across periods while explicit rule-based constraints provide strong in-domain improvements but limited cross-period generalization.</abstract>
<identifier type="citekey">voinea-2026-validator</identifier>
<location>
<url>https://aclanthology.org/2026.acl-srw.69/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>779</start>
<end>790</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Validator-Guided Hard Negative Mining for Masked Language Modeling in Low-Resource Ancient Languages
%A Voinea, Andrei
%Y T.Y.S.S., Santosh
%Y Rodriguez, Juan Diego
%Y de Gibert, Ona
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-393-7
%F voinea-2026-validator
%X Masked language modeling for low-resource ancient languages remains challenging because pre-trained multilingual models lack exposure to these languages. We investigate rule-based linguistic constraints and hard negative mining for Sumerian, a language isolate not included in multilingual BERT’s training data. We build a hierarchical validator capturing subword, word, and part-of-speech patterns from 4,545 annotated sequences, using it to filter candidates and identify hard negatives for fine-tuning. Vanilla mBERT achieves 18.0% hit@10 accuracy. The validator alone improves this to 72.8%, while hard negative fine-tuning reaches 78.3%. Combining both yields 86.7%, a 68.7 percentage point improvement. Temporal generalization evaluation on tablets from 600 years earlier shows that both the hard negative mining and the validator alone improve performance, but the combined approach underperforms due to the validator’s period specific rules. These findings demonstrate that hard negative mining transfers across periods while explicit rule-based constraints provide strong in-domain improvements but limited cross-period generalization.
%U https://aclanthology.org/2026.acl-srw.69/
%P 779-790
Markdown (Informal)
[Validator-Guided Hard Negative Mining for Masked Language Modeling in Low-Resource Ancient Languages](https://aclanthology.org/2026.acl-srw.69/) (Voinea, ACL 2026)
ACL