@inproceedings{osorio-lopes-cardoso-2026-portoldbert,
title = "{P}ort{O}ld{BERT}: {P}ortuguese Historical Language Models",
author = "Osorio, Tomas Freitas and
Lopes Cardoso, Henrique",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.123/",
pages = "2691--2705",
ISBN = "979-8-89176-380-7",
abstract = "Historical language models play a crucial role in the study of languages, and can benefit tasks such as named-entity recognition (NER), part-of-speech (PoS) tagging, and post-OCR correction, among others. Despite their relevance, most efforts have been concentrated on English. To the best of our knowledge, no such model exists for historical Portuguese. In this work, we introduce PortOldBERT, the first historical Portuguese encoder language model. We demonstrate its usefulness by comparing PortOldBERT{'}s performance with Albertina, the encoder on which it is based, across multiple tasks{---}pseudo-perplexity, NER, PoS tagging, word error rate (WER) prediction, and OCR error detection{---}and for different historical periods. PortOldBERT consistently outperforms Albertina in historical data, demonstrating its ability to effectively integrate historical linguistic contexts while retaining the ability to process contemporary text."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="osorio-lopes-cardoso-2026-portoldbert">
<titleInfo>
<title>PortOldBERT: Portuguese Historical Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tomas</namePart>
<namePart type="given">Freitas</namePart>
<namePart type="family">Osorio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henrique</namePart>
<namePart type="family">Lopes Cardoso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Historical language models play a crucial role in the study of languages, and can benefit tasks such as named-entity recognition (NER), part-of-speech (PoS) tagging, and post-OCR correction, among others. Despite their relevance, most efforts have been concentrated on English. To the best of our knowledge, no such model exists for historical Portuguese. In this work, we introduce PortOldBERT, the first historical Portuguese encoder language model. We demonstrate its usefulness by comparing PortOldBERT’s performance with Albertina, the encoder on which it is based, across multiple tasks—pseudo-perplexity, NER, PoS tagging, word error rate (WER) prediction, and OCR error detection—and for different historical periods. PortOldBERT consistently outperforms Albertina in historical data, demonstrating its ability to effectively integrate historical linguistic contexts while retaining the ability to process contemporary text.</abstract>
<identifier type="citekey">osorio-lopes-cardoso-2026-portoldbert</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.123/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>2691</start>
<end>2705</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PortOldBERT: Portuguese Historical Language Models
%A Osorio, Tomas Freitas
%A Lopes Cardoso, Henrique
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F osorio-lopes-cardoso-2026-portoldbert
%X Historical language models play a crucial role in the study of languages, and can benefit tasks such as named-entity recognition (NER), part-of-speech (PoS) tagging, and post-OCR correction, among others. Despite their relevance, most efforts have been concentrated on English. To the best of our knowledge, no such model exists for historical Portuguese. In this work, we introduce PortOldBERT, the first historical Portuguese encoder language model. We demonstrate its usefulness by comparing PortOldBERT’s performance with Albertina, the encoder on which it is based, across multiple tasks—pseudo-perplexity, NER, PoS tagging, word error rate (WER) prediction, and OCR error detection—and for different historical periods. PortOldBERT consistently outperforms Albertina in historical data, demonstrating its ability to effectively integrate historical linguistic contexts while retaining the ability to process contemporary text.
%U https://aclanthology.org/2026.eacl-long.123/
%P 2691-2705
Markdown (Informal)
[PortOldBERT: Portuguese Historical Language Models](https://aclanthology.org/2026.eacl-long.123/) (Osorio & Lopes Cardoso, EACL 2026)
ACL
- Tomas Freitas Osorio and Henrique Lopes Cardoso. 2026. PortOldBERT: Portuguese Historical Language Models. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 2691–2705, Rabat, Morocco. Association for Computational Linguistics.