@inproceedings{moreira-etal-2026-anatomy,
  title = "Anatomy of Data Repositories for the Analysis and Detection of Toxicity in {P}ortuguese",
  author = "Moreira, Lorena Souza and
    Gibrim, Paula Teresa M. and
    Rocha, Leonardo and
    Reis, Julio C. S.",
  editor = "Souza, Marlo and
    de-Dios-Flores, Iria and
    Santos, Diana and
    Freitas, Larissa and
    Souza, Jackson Wilke da Cruz and
    Ribeiro, Eug{\'e}nio",
  booktitle = "Proceedings of the 17th International Conference on Computational Processing of {P}ortuguese ({PROPOR} 2026) - Vol. 1",
  month = apr,
  year = "2026",
  address = "Salvador, Brazil",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2026.propor-1.45/",
  pages = "456--466",
  isbn = "979-8-89176-387-6",
  abstract = "The proliferation of online hate speech requires a rigorous examination of the datasets used to train detection models. In this work, we analyze six Brazilian Portuguese datasets annotated for hate speech or toxicity to investigate how their lexical ``anatomy'' and domain characteristics affect cross-domain generalization. We combine HurtLex-based lexical profiling with cross-dataset evaluation in a feature-based transfer-learning setup, using BERTimbau embeddings and an XGBoost classifier. Our analysis shows that, although the datasets share a broadly similar macro-level focus, they diverge substantially in how specific terms are used and labeled across platforms and topics. Results indicate that lexical breadth and annotation practices strongly predict transferability: datasets with broader and more heterogeneous toxic vocabulary yield better cross-domain performance, whereas resources with narrow, profanity-centered labeling lead to severe generalization gaps, even when lexical overlap is high. These findings underscore the impact of collection and labeling strategies on the curation and evaluation of Portuguese hate speech datasets. Warning! This work and the referenced datasets contain examples of offensive and hateful language."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="moreira-etal-2026-anatomy">
<titleInfo>
<title>Anatomy of Data Repositories for the Analysis and Detection of Toxicity in Portuguese</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lorena</namePart>
<namePart type="given">Souza</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paula</namePart>
<namePart type="given">Teresa</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Gibrim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Rocha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julio</namePart>
<namePart type="given">C</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Reis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>The proliferation of online hate speech requires a rigorous examination of the datasets used to train detection models. In this work, we analyze six Brazilian Portuguese datasets annotated for hate speech or toxicity to investigate how their lexical “anatomy” and domain characteristics affect cross-domain generalization. We combine HurtLex-based lexical profiling with cross-dataset evaluation in a feature-based transfer-learning setup, using BERTimbau embeddings and an XGBoost classifier. Our analysis shows that, although the datasets share a broadly similar macro-level focus, they diverge substantially in how specific terms are used and labeled across platforms and topics. Results indicate that lexical breadth and annotation practices strongly predict transferability: datasets with broader and more heterogeneous toxic vocabulary yield better cross-domain performance, whereas resources with narrow, profanity-centered labeling lead to severe generalization gaps, even when lexical overlap is high. These findings underscore the impact of collection and labeling strategies on the curation and evaluation of Portuguese hate speech datasets. Warning! This work and the referenced datasets contain examples of offensive and hateful language.</abstract>
<identifier type="citekey">moreira-etal-2026-anatomy</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.45/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>456</start>
<end>466</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Anatomy of Data Repositories for the Analysis and Detection of Toxicity in Portuguese
%A Moreira, Lorena Souza
%A Gibrim, Paula Teresa M.
%A Rocha, Leonardo
%A Reis, Julio C. S.
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F moreira-etal-2026-anatomy
%X The proliferation of online hate speech requires a rigorous examination of the datasets used to train detection models. In this work, we analyze six Brazilian Portuguese datasets annotated for hate speech or toxicity to investigate how their lexical “anatomy” and domain characteristics affect cross-domain generalization. We combine HurtLex-based lexical profiling with cross-dataset evaluation in a feature-based transfer-learning setup, using BERTimbau embeddings and an XGBoost classifier. Our analysis shows that, although the datasets share a broadly similar macro-level focus, they diverge substantially in how specific terms are used and labeled across platforms and topics. Results indicate that lexical breadth and annotation practices strongly predict transferability: datasets with broader and more heterogeneous toxic vocabulary yield better cross-domain performance, whereas resources with narrow, profanity-centered labeling lead to severe generalization gaps, even when lexical overlap is high. These findings underscore the impact of collection and labeling strategies on the curation and evaluation of Portuguese hate speech datasets. Warning! This work and the referenced datasets contain examples of offensive and hateful language.
%U https://aclanthology.org/2026.propor-1.45/
%P 456-466
Markdown (Informal)
[Anatomy of Data Repositories for the Analysis and Detection of Toxicity in Portuguese](https://aclanthology.org/2026.propor-1.45/) (Moreira et al., PROPOR 2026)
ACL