@inproceedings{nunes-etal-2026-twenty,
title = "Twenty Years of {HAREM}: A Reproducible Audit and Reassessment of {P}ortuguese Named Entity Recognition",
author = "Nunes, Rafael O. and
Spritzer, Andr{\'e} and
Freitas, Carla M. D. S. and
Balreira, Dennis G.",
editor = "Souza, Marlo and
de-Dios-Flores, Iria and
Santos, Diana and
Freitas, Larissa and
Souza, Jackson Wilke da Cruz and
Ribeiro, Eug{\'e}nio",
booktitle = "Proceedings of the 17th International Conference on Computational Processing of {P}ortuguese ({PROPOR} 2026) - Vol. 1",
month = apr,
year = "2026",
address = "Salvador, Brazil",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.propor-1.35/",
pages = "351--359",
ISBN = "979-8-89176-387-6",
abstract = "For two decades, the HAREM corpus has served as the foundational benchmark for Portuguese Named Entity Recognition (NER), establishing its evaluation paradigm. Virtually all major progress has been measured against its fixed train/test split. This paper presents the first systematic audit of this split, revealing 153 overlapping (contaminated) sentences. We re-evaluate 13 NER models (ranging from CRFs to Transformers) on both the original and a new, decontaminated version of the corpus. Our statistical analysis reveals that decontamination has a significant (p {\ensuremath{<}} 0.05) and positive impact on the majority of models. We find that performance gains are most pronounced in the F1$_{\text{macro}}$ score (up to +4 points), demonstrating that the contamination primarily harmed generalization on rare entity types. Furthermore, our audit reveals clear evidence of overfitting in some models that benefited from data leakage. We conclude that even minor contamination can distort performance metrics and mask true model generalization. We release our decontaminated benchmark to ensure more reliable future evaluations."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nunes-etal-2026-twenty">
<titleInfo>
<title>Twenty Years of HAREM: A Reproducible Audit and Reassessment of Portuguese Named Entity Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rafael</namePart>
<namePart type="given">O</namePart>
<namePart type="family">Nunes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">André</namePart>
<namePart type="family">Spritzer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carla</namePart>
<namePart type="given">M</namePart>
<namePart type="given">D</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dennis</namePart>
<namePart type="given">G</namePart>
<namePart type="family">Balreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>For two decades, the HAREM corpus has served as the foundational benchmark for Portuguese Named Entity Recognition (NER), establishing its evaluation paradigm. Virtually all major progress has been measured against its fixed train/test split. This paper presents the first systematic audit of this split, revealing 153 overlapping (contaminated) sentences. We re-evaluate 13 NER models (ranging from CRFs to Transformers) on both the original and a new, decontaminated version of the corpus. Our statistical analysis reveals that decontamination has a significant (p &lt; 0.05) and positive impact on the majority of models. We find that performance gains are most pronounced in the F1_macro score (up to +4 points), demonstrating that the contamination primarily harmed generalization on rare entity types. Furthermore, our audit reveals clear evidence of overfitting in some models that benefited from data leakage. We conclude that even minor contamination can distort performance metrics and mask true model generalization. We release our decontaminated benchmark to ensure more reliable future evaluations.</abstract>
<identifier type="citekey">nunes-etal-2026-twenty</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.35/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>351</start>
<end>359</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Twenty Years of HAREM: A Reproducible Audit and Reassessment of Portuguese Named Entity Recognition
%A Nunes, Rafael O.
%A Spritzer, André
%A Freitas, Carla M. D. S.
%A Balreira, Dennis G.
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F nunes-etal-2026-twenty
%X For two decades, the HAREM corpus has served as the foundational benchmark for Portuguese Named Entity Recognition (NER), establishing its evaluation paradigm. Virtually all major progress has been measured against its fixed train/test split. This paper presents the first systematic audit of this split, revealing 153 overlapping (contaminated) sentences. We re-evaluate 13 NER models (ranging from CRFs to Transformers) on both the original and a new, decontaminated version of the corpus. Our statistical analysis reveals that decontamination has a significant (p < 0.05) and positive impact on the majority of models. We find that performance gains are most pronounced in the F1_macro score (up to +4 points), demonstrating that the contamination primarily harmed generalization on rare entity types. Furthermore, our audit reveals clear evidence of overfitting in some models that benefited from data leakage. We conclude that even minor contamination can distort performance metrics and mask true model generalization. We release our decontaminated benchmark to ensure more reliable future evaluations.
%U https://aclanthology.org/2026.propor-1.35/
%P 351-359
Markdown (Informal)
[Twenty Years of HAREM: A Reproducible Audit and Reassessment of Portuguese Named Entity Recognition](https://aclanthology.org/2026.propor-1.35/) (Nunes et al., PROPOR 2026)
ACL