% Conference paper in the PROPOR 2026 proceedings (Vol. 1), pages 540--550.
% Author and editor names use the "Last, First" form; case-sensitive words in
% the titles are protected with whole-word braces.
@inproceedings{santos-etal-2026-lexiris,
    title     = {{LexIris-pt} and {LexBert-pt}: Specialized Sentence Embeddings for Legal Similarity in {Brazilian} {Portuguese}},
    author    = {Santos, Willgnner Ferreira and
                 Viana, Jo{\~a}o Gabriel Grandotto and
                 J{\'u}nior, Ant{\^o}nio Pires de Castro and
                 Trindade, Fernando Ribeiro and
                 Silva, N{\'a}dia F{\'e}lix Felipe da},
    editor    = {Souza, Marlo and
                 de-Dios-Flores, Iria and
                 Santos, Diana and
                 Freitas, Larissa and
                 Souza, Jackson Wilke da Cruz and
                 Ribeiro, Eug{\'e}nio},
    booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
    month     = apr,
    year      = {2026},
    address   = {Salvador, Brazil},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.propor-1.53/},
    pages     = {540--550},
    isbn      = {979-8-89176-387-6},
    abstract  = {This work presents and evaluates two specialized sentence embedding models for the Portuguese legal domain, LexIris-pt and LexBert-pt, obtained through supervised fine-tuning of BERT-based models using pairs of initial petitions. We propose a comparative evaluation protocol along three fronts: (i) zero-shot inference with pretrained embeddings, (ii) supervised fine-tuning on these pairs, and (iii) vector retrieval with incremental clustering over a corpus of 20,000 initial petitions. The results show that fine-tuning consistently increases correlations with reference scores and improves performance in vector retrieval; additionally, the vector retrieval stage indicates that the metric configured in the index (cosine similarity or inner product) can change the granularity of the partitioning under a fixed threshold, reinforcing the need for joint calibration among the encoder, metric and threshold. After auditing by specialists from the partner institution, LexIris-pt and LexBert-pt were operationally adopted to support the screening and organization of repetitive claims and predatory litigation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="santos-etal-2026-lexiris">
    <titleInfo>
      <title>LexIris-pt and LexBert-pt: Specialized Sentence Embeddings for Legal Similarity in Brazilian Portuguese</title>
    </titleInfo>

    <!-- Authors of the paper, one <name> per person, in listed order. -->
    <name type="personal">
      <namePart type="given">Willgnner</namePart>
      <namePart type="given">Ferreira</namePart>
      <namePart type="family">Santos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">João</namePart>
      <namePart type="given">Gabriel</namePart>
      <namePart type="given">Grandotto</namePart>
      <namePart type="family">Viana</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Antônio</namePart>
      <namePart type="given">Pires</namePart>
      <namePart type="given">de</namePart>
      <namePart type="given">Castro</namePart>
      <namePart type="family">Júnior</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fernando</namePart>
      <namePart type="given">Ribeiro</namePart>
      <namePart type="family">Trindade</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nádia</namePart>
      <namePart type="given">Félix</namePart>
      <namePart type="given">Felipe</namePart>
      <namePart type="given">da</namePart>
      <namePart type="family">Silva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month). -->
    <originInfo>
      <dateIssued>2026-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marlo</namePart>
        <namePart type="family">Souza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Iria</namePart>
        <namePart type="family">de-Dios-Flores</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Diana</namePart>
        <namePart type="family">Santos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Larissa</namePart>
        <namePart type="family">Freitas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jackson</namePart>
        <namePart type="given">Wilke</namePart>
        <namePart type="given">da</namePart>
        <namePart type="given">Cruz</namePart>
        <namePart type="family">Souza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Eugénio</namePart>
        <namePart type="family">Ribeiro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Salvador, Brazil</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-387-6</identifier>
    </relatedItem>

    <abstract>This work presents and evaluates two specialized sentence embedding models for the Portuguese legal domain, LexIris-pt and LexBert-pt, obtained through supervised fine-tuning of BERT-based models using pairs of initial petitions. We propose a comparative evaluation protocol along three fronts: (i) zero-shot inference with pretrained embeddings, (ii) supervised fine-tuning on these pairs, and (iii) vector retrieval with incremental clustering over a corpus of 20,000 initial petitions. The results show that fine-tuning consistently increases correlations with reference scores and improves performance in vector retrieval; additionally, the vector retrieval stage indicates that the metric configured in the index (cosine similarity or inner product) can change the granularity of the partitioning under a fixed threshold, reinforcing the need for joint calibration among the encoder, metric and threshold. After auditing by specialists from the partner institution, LexIris-pt and LexBert-pt were operationally adopted to support the screening and organization of repetitive claims and predatory litigation.</abstract>

    <!-- Citation key and landing-page URL for this record. -->
    <identifier type="citekey">santos-etal-2026-lexiris</identifier>
    <location>
      <url>https://aclanthology.org/2026.propor-1.53/</url>
    </location>

    <!-- Position of the paper inside the host volume: page range 540 to 550. -->
    <part>
      <date>2026-04</date>
      <extent unit="page">
        <start>540</start>
        <end>550</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LexIris-pt and LexBert-pt: Specialized Sentence Embeddings for Legal Similarity in Brazilian Portuguese
%A Santos, Willgnner Ferreira
%A Viana, João Gabriel Grandotto
%A Júnior, Antônio Pires de Castro
%A Trindade, Fernando Ribeiro
%A Silva, Nádia Félix Felipe da
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F santos-etal-2026-lexiris
%X This work presents and evaluates two specialized sentence embedding models for the Portuguese legal domain, LexIris-pt and LexBert-pt, obtained through supervised fine-tuning of BERT-based models using pairs of initial petitions. We propose a comparative evaluation protocol along three fronts: (i) zero-shot inference with pretrained embeddings, (ii) supervised fine-tuning on these pairs, and (iii) vector retrieval with incremental clustering over a corpus of 20,000 initial petitions. The results show that fine-tuning consistently increases correlations with reference scores and improves performance in vector retrieval; additionally, the vector retrieval stage indicates that the metric configured in the index (cosine similarity or inner product) can change the granularity of the partitioning under a fixed threshold, reinforcing the need for joint calibration among the encoder, metric and threshold. After auditing by specialists from the partner institution, LexIris-pt and LexBert-pt were operationally adopted to support the screening and organization of repetitive claims and predatory litigation.
%U https://aclanthology.org/2026.propor-1.53/
%P 540-550
Markdown (Informal)
[LexIris-pt and LexBert-pt: Specialized Sentence Embeddings for Legal Similarity in Brazilian Portuguese](https://aclanthology.org/2026.propor-1.53/) (Santos et al., PROPOR 2026)
ACL