% Conference paper record for "JabuticaBERT" (PROPOR 2026, Vol. 1).
% Case-sensitive words in title/booktitle are protected as whole words so that
% sentence-casing styles keep their capitals without splitting a word across
% brace groups. Accented names use BibTeX special-character groups ({\^e}, {\'e}).
@inproceedings{porto-etal-2026-jabuticabert,
  title     = {{JabuticaBERT}: {Modern} {Portuguese} Encoders from Scratch with {RTD} and Long-Context Training},
  author    = {Porto, Thiago and
               Gomes, Gabriel and
               Bender, Alexandre and
               Corr{\^e}a, Ulisses and
               Freitas, Larissa and
               Cruz, William and
               Amadeus, Marcellus},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.propor-1.93/},
  pages     = {933--942},
  isbn      = {979-8-89176-387-6},
  abstract  = {Encoder-based language models remain essential for natural language understanding tasks such as classification, semantic similarity, and retrieval-augmented generation. However, the lack of high-quality monolingual encoders for Brazilian Portuguese poses a significant challenge to performance. In this work, we systematically explore the training of Portuguese-specific encoder models from scratch using two modern architectures: DeBERTa, trained with Replaced Token Detection (RTD), and ModernBERT, trained with Masked Language Modeling (MLM). All models are pre-trained on the large-scale Jabuticaba corpus. Our DeBERTa-Large model achieves results comparable to the state-of-the-art, with F1 scores of 0.920 on ASSIN2 RTE and 0.915 on LeNER. Crucially, it matches the performance of the 900M-parameter Albertina model while utilizing significantly fewer parameters. We also release custom tokenizers that reduce token fertility rates compared to multilingual baselines. These findings provide evidence that careful architectural choices and monolingual tokenization can yield competitive performance without massive model scaling.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; the ID matches the citekey identifier below. -->
  <mods ID="porto-etal-2026-jabuticabert">
    <titleInfo>
      <title>JabuticaBERT: Modern Portuguese Encoders from Scratch with RTD and Long-Context Training</title>
    </titleInfo>

    <!-- Paper authors (role: author), one <name> per person. -->
    <name type="personal">
      <namePart type="given">Thiago</namePart>
      <namePart type="family">Porto</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gabriel</namePart>
      <namePart type="family">Gomes</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexandre</namePart>
      <namePart type="family">Bender</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ulisses</namePart>
      <namePart type="family">Corrêa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Larissa</namePart>
      <namePart type="family">Freitas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">William</namePart>
      <namePart type="family">Cruz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marcellus</namePart>
      <namePart type="family">Amadeus</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marlo</namePart>
        <namePart type="family">Souza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Iria</namePart>
        <namePart type="family">de-Dios-Flores</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Diana</namePart>
        <namePart type="family">Santos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Larissa</namePart>
        <namePart type="family">Freitas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <!-- Each given-name token is recorded as its own namePart. -->
        <namePart type="given">Jackson</namePart>
        <namePart type="given">Wilke</namePart>
        <namePart type="given">da</namePart>
        <namePart type="given">Cruz</namePart>
        <namePart type="family">Souza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Eugénio</namePart>
        <namePart type="family">Ribeiro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Salvador, Brazil</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-387-6</identifier>
    </relatedItem>

    <abstract>Encoder-based language models remain essential for natural language understanding tasks such as classification, semantic similarity, and retrieval-augmented generation. However, the lack of high-quality monolingual encoders for Brazilian Portuguese poses a significant challenge to performance. In this work, we systematically explore the training of Portuguese-specific encoder models from scratch using two modern architectures: DeBERTa, trained with Replaced Token Detection (RTD), and ModernBERT, trained with Masked Language Modeling (MLM). All models are pre-trained on the large-scale Jabuticaba corpus. Our DeBERTa-Large model achieves results comparable to the state-of-the-art, with F1 scores of 0.920 on ASSIN2 RTE and 0.915 on LeNER. Crucially, it matches the performance of the 900M-parameter Albertina model while utilizing significantly fewer parameters. We also release custom tokenizers that reduce token fertility rates compared to multilingual baselines. These findings provide evidence that careful architectural choices and monolingual tokenization can yield competitive performance without massive model scaling.</abstract>
    <identifier type="citekey">porto-etal-2026-jabuticabert</identifier>
    <location>
      <url>https://aclanthology.org/2026.propor-1.93/</url>
    </location>

    <!-- Issue date and page range of the paper. -->
    <part>
      <date>2026-04</date>
      <extent unit="page">
        <start>933</start>
        <end>942</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T JabuticaBERT: Modern Portuguese Encoders from Scratch with RTD and Long-Context Training
%A Porto, Thiago
%A Gomes, Gabriel
%A Bender, Alexandre
%A Corrêa, Ulisses
%A Freitas, Larissa
%A Cruz, William
%A Amadeus, Marcellus
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F porto-etal-2026-jabuticabert
%X Encoder-based language models remain essential for natural language understanding tasks such as classification, semantic similarity, and retrieval-augmented generation. However, the lack of high-quality monolingual encoders for Brazilian Portuguese poses a significant challenge to performance. In this work, we systematically explore the training of Portuguese-specific encoder models from scratch using two modern architectures: DeBERTa, trained with Replaced Token Detection (RTD), and ModernBERT, trained with Masked Language Modeling (MLM). All models are pre-trained on the large-scale Jabuticaba corpus. Our DeBERTa-Large model achieves results comparable to the state-of-the-art, with F1 scores of 0.920 on ASSIN2 RTE and 0.915 on LeNER. Crucially, it matches the performance of the 900M-parameter Albertina model while utilizing significantly fewer parameters. We also release custom tokenizers that reduce token fertility rates compared to multilingual baselines. These findings provide evidence that careful architectural choices and monolingual tokenization can yield competitive performance without massive model scaling.
%U https://aclanthology.org/2026.propor-1.93/
%P 933-942
Markdown (Informal)
[JabuticaBERT: Modern Portuguese Encoders from Scratch with RTD and Long-Context Training](https://aclanthology.org/2026.propor-1.93/) (Porto et al., PROPOR 2026)
ACL