@inproceedings{suvarna-etal-2025-modelcitizens,
    title = "{M}odel{C}itizens: Representing Community Voices in Online Safety",
    author = "Suvarna, Ashima and
      Chance, Christina A and
      Naranjo, Karolina and
      Palangi, Hamid and
      Hao, Sophie and
      Hartvigsen, Thomas and
      Gabriel, Saadia",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.emnlp-main.1571/",
    pages = "30838--30854",
    ISBN = "979-8-89176-332-6",
    abstract = "Automatic toxic language detection is important for creating safe, inclusive online spaces. However, it is a highly subjective task, with perceptions of toxic language shaped by community norms and lived experience. Existing toxicity detection models are typically trained on annotations that collapse diverse annotator perspectives into a single ground truth, erasing important context-specific notions of toxicity such as reclaimed language. To address this, we introduce MODELCITIZENS, a dataset of 6.8K social media posts and 40K toxicity annotations across diverse identity groups. To reflect the impact of conversational context on toxicity, typical of social media posts, we augment MODELCITIZENS posts with LLM-generated conversational scenarios. State-of-the-art toxicity detection tools (e.g. OpenAI Moderation API, GPT-o4-mini) underperform on MODELCITIZENS with further degradation on context-augmented posts. Finally, we release LLAMACITIZEN-8B and GEMMACITIZEN-12B, LLaMA and Gemma-based models finetuned on our dataset, which outperform GPT-o4-mini by 5.5{\%} on in-distribution evaluations. Our findings highlight the importance of community-informed annotation and modeling for inclusive content moderation. We will release all code, data and models upon publication."
}
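A minimal sketch of reading the BibTeX record above from Python, using only the standard library. The `BIBTEX` constant abbreviates the entry and the `bibtex_fields` helper is illustrative, not a full parser; values with nested braces or escapes such as {\%} need a real parser (e.g. the third-party bibtexparser package):

```python
import re

# The entry above, abbreviated to a few flat `key = "value"` fields.
BIBTEX = '''
@inproceedings{suvarna-etal-2025-modelcitizens,
    title = "{M}odel{C}itizens: Representing Community Voices in Online Safety",
    year = "2025",
    url = "https://aclanthology.org/2025.emnlp-main.1571/",
    pages = "30838--30854",
}
'''

def bibtex_fields(entry):
    """Collect flat `key = "value"` pairs; skips the entry type and cite key."""
    return dict(re.findall(r'(\w+)\s*=\s*"([^"]*)"', entry))

fields = bibtex_fields(BIBTEX)
print(fields["title"])  # {M}odel{C}itizens: Representing Community Voices in Online Safety
print(fields["pages"])  # 30838--30854
```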
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="suvarna-etal-2025-modelcitizens">
    <titleInfo>
      <title>ModelCitizens: Representing Community Voices in Online Safety</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ashima</namePart>
      <namePart type="family">Suvarna</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christina</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Chance</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Karolina</namePart>
      <namePart type="family">Naranjo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hamid</namePart>
      <namePart type="family">Palangi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sophie</namePart>
      <namePart type="family">Hao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Thomas</namePart>
      <namePart type="family">Hartvigsen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saadia</namePart>
      <namePart type="family">Gabriel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-332-6</identifier>
    </relatedItem>
    <abstract>Automatic toxic language detection is important for creating safe, inclusive online spaces. However, it is a highly subjective task, with perceptions of toxic language shaped by community norms and lived experience. Existing toxicity detection models are typically trained on annotations that collapse diverse annotator perspectives into a single ground truth, erasing important context-specific notions of toxicity such as reclaimed language. To address this, we introduce MODELCITIZENS, a dataset of 6.8K social media posts and 40K toxicity annotations across diverse identity groups. To reflect the impact of conversational context on toxicity, typical of social media posts, we augment MODELCITIZENS posts with LLM-generated conversational scenarios. State-of-the-art toxicity detection tools (e.g. OpenAI Moderation API, GPT-o4-mini) underperform on MODELCITIZENS with further degradation on context-augmented posts. Finally, we release LLAMACITIZEN-8B and GEMMACITIZEN-12B, LLaMA and Gemma-based models finetuned on our dataset, which outperform GPT-o4-mini by 5.5% on in-distribution evaluations. Our findings highlight the importance of community-informed annotation and modeling for inclusive content moderation. We will release all code, data and models upon publication.</abstract>
    <identifier type="citekey">suvarna-etal-2025-modelcitizens</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-main.1571/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>30838</start>
        <end>30854</end>
      </extent>
    </part>
  </mods>
</modsCollection>
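A minimal sketch of reading the MODS record above with the standard library's xml.etree.ElementTree. The `MODS` constant trims the record to one author for brevity; the full record parses the same way. MODS tags live in the http://www.loc.gov/mods/v3 namespace, so every lookup must qualify it:

```python
import xml.etree.ElementTree as ET

# The record above, trimmed to the title and one author.
MODS = """\
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="suvarna-etal-2025-modelcitizens">
    <titleInfo>
      <title>ModelCitizens: Representing Community Voices in Online Safety</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ashima</namePart>
      <namePart type="family">Suvarna</namePart>
      <role><roleTerm authority="marcrelator" type="text">author</roleTerm></role>
    </name>
  </mods>
</modsCollection>
"""

NS = {"m": "http://www.loc.gov/mods/v3"}
mods = ET.fromstring(MODS).find("m:mods", NS)

title = mods.find("m:titleInfo/m:title", NS).text
# Direct <name> children of <mods> are the paper's authors; editors sit
# inside <relatedItem> and are not matched here.
authors = [
    " ".join(part.text for part in name.findall("m:namePart", NS))
    for name in mods.findall("m:name", NS)
]
print(title)    # ModelCitizens: Representing Community Voices in Online Safety
print(authors)  # ['Ashima Suvarna']
```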
%0 Conference Proceedings
%T ModelCitizens: Representing Community Voices in Online Safety
%A Suvarna, Ashima
%A Chance, Christina A
%A Naranjo, Karolina
%A Palangi, Hamid
%A Hao, Sophie
%A Hartvigsen, Thomas
%A Gabriel, Saadia
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F suvarna-etal-2025-modelcitizens
%X Automatic toxic language detection is important for creating safe, inclusive online spaces. However, it is a highly subjective task, with perceptions of toxic language shaped by community norms and lived experience. Existing toxicity detection models are typically trained on annotations that collapse diverse annotator perspectives into a single ground truth, erasing important context-specific notions of toxicity such as reclaimed language. To address this, we introduce MODELCITIZENS, a dataset of 6.8K social media posts and 40K toxicity annotations across diverse identity groups. To reflect the impact of conversational context on toxicity, typical of social media posts, we augment MODELCITIZENS posts with LLM-generated conversational scenarios. State-of-the-art toxicity detection tools (e.g. OpenAI Moderation API, GPT-o4-mini) underperform on MODELCITIZENS with further degradation on context-augmented posts. Finally, we release LLAMACITIZEN-8B and GEMMACITIZEN-12B, LLaMA and Gemma-based models finetuned on our dataset, which outperform GPT-o4-mini by 5.5% on in-distribution evaluations. Our findings highlight the importance of community-informed annotation and modeling for inclusive content moderation. We will release all code, data and models upon publication.
%U https://aclanthology.org/2025.emnlp-main.1571/
%P 30838-30854
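A minimal sketch of folding the %-tagged Endnote/refer record above into a dict; the `ENDNOTE` constant abbreviates the record. Tags like %A (author) and %Y (editor) repeat, so every tag maps to a list:

```python
# The record above, abbreviated to a few tags.
ENDNOTE = """\
%0 Conference Proceedings
%T ModelCitizens: Representing Community Voices in Online Safety
%A Suvarna, Ashima
%A Chance, Christina A
%D 2025
%P 30838-30854
"""

record = {}
for line in ENDNOTE.strip().splitlines():
    # Each line is "%TAG value"; split on the first space only.
    tag, _, value = line.partition(" ")
    record.setdefault(tag, []).append(value)

print(record["%T"][0])  # the title
print(record["%A"])     # ['Suvarna, Ashima', 'Chance, Christina A']
```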
Markdown (Informal)
[ModelCitizens: Representing Community Voices in Online Safety](https://aclanthology.org/2025.emnlp-main.1571/) (Suvarna et al., EMNLP 2025)

ACL
Ashima Suvarna, Christina A Chance, Karolina Naranjo, Hamid Palangi, Sophie Hao, Thomas Hartvigsen, and Saadia Gabriel. 2025. ModelCitizens: Representing Community Voices in Online Safety. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 30838–30854, Suzhou, China. Association for Computational Linguistics.