@inproceedings{ibrahim-etal-2025-large,
title = "Large Language Models as Detectors or Instigators of Hate Speech in Low-resource {E}thiopian Languages",
author = "Ibrahim, Nuhu and
Mulford, Felicity and
Batista-Navarro, Riza",
editor = "Zhang, Chen and
Allaway, Emily and
Shen, Hua and
Miculicich, Lesly and
Li, Yinqiao and
M'hamdi, Meryem and
Limkonchotiwat, Peerat and
Bai, Richard He and
T.y.s.s., Santosh and
Han, Sophia Simeng and
Thapa, Surendrabikram and
Rim, Wiem Ben",
booktitle = "Proceedings of the 9th Widening NLP Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.winlp-main.31/",
pages = "197--209",
ISBN = "979-8-89176-351-7",
abstract = "We introduce a multilingual benchmark for evaluating large language models (LLMs) on hate speech detection and generation in low-resource Ethiopian languages: Afaan Oromo, Amharic and Tigrigna, and English (both monolingual and code-mixed). Using a balanced and expert-annotated dataset, we assess five state-of-the-art LLM families across both tasks. Our results show that while LLMs perform well on English detection, their performance on low-resource languages is significantly weaker, revealing that increasing model size alone does not ensure multilingual robustness. More critically, we find that all models, including closed and open-source variants, can be prompted to generate profiled hate speech with minimal resistance. These findings underscore the dual risk of exclusion and exploitation: LLMs fail to protect low-resource communities while enabling scalable harm against them. We make our evaluation framework available to facilitate future research on multilingual model safety and ethical robustness."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ibrahim-etal-2025-large">
<titleInfo>
<title>Large Language Models as Detectors or Instigators of Hate Speech in Low-resource Ethiopian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nuhu</namePart>
<namePart type="family">Ibrahim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felicity</namePart>
<namePart type="family">Mulford</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Riza</namePart>
<namePart type="family">Batista-Navarro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th Widening NLP Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Allaway</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hua</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lesly</namePart>
<namePart type="family">Miculicich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yinqiao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meryem</namePart>
<namePart type="family">M’hamdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peerat</namePart>
<namePart type="family">Limkonchotiwat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="given">He</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Santosh</namePart>
<namePart type="family">T.y.s.s.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="given">Simeng</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surendrabikram</namePart>
<namePart type="family">Thapa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wiem</namePart>
<namePart type="given">Ben</namePart>
<namePart type="family">Rim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-351-7</identifier>
</relatedItem>
<abstract>We introduce a multilingual benchmark for evaluating large language models (LLMs) on hate speech detection and generation in low-resource Ethiopian languages: Afaan Oromo, Amharic and Tigrigna, and English (both monolingual and code-mixed). Using a balanced and expert-annotated dataset, we assess five state-of-the-art LLM families across both tasks. Our results show that while LLMs perform well on English detection, their performance on low-resource languages is significantly weaker, revealing that increasing model size alone does not ensure multilingual robustness. More critically, we find that all models, including closed and open-source variants, can be prompted to generate profiled hate speech with minimal resistance. These findings underscore the dual risk of exclusion and exploitation: LLMs fail to protect low-resource communities while enabling scalable harm against them. We make our evaluation framework available to facilitate future research on multilingual model safety and ethical robustness.</abstract>
<identifier type="citekey">ibrahim-etal-2025-large</identifier>
<location>
<url>https://aclanthology.org/2025.winlp-main.31/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>197</start>
<end>209</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T Large Language Models as Detectors or Instigators of Hate Speech in Low-resource Ethiopian Languages
%A Ibrahim, Nuhu
%A Mulford, Felicity
%A Batista-Navarro, Riza
%Y Zhang, Chen
%Y Allaway, Emily
%Y Shen, Hua
%Y Miculicich, Lesly
%Y Li, Yinqiao
%Y M’hamdi, Meryem
%Y Limkonchotiwat, Peerat
%Y Bai, Richard He
%Y T.y.s.s., Santosh
%Y Han, Sophia Simeng
%Y Thapa, Surendrabikram
%Y Rim, Wiem Ben
%S Proceedings of the 9th Widening NLP Workshop
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-351-7
%F ibrahim-etal-2025-large
%X We introduce a multilingual benchmark for evaluating large language models (LLMs) on hate speech detection and generation in low-resource Ethiopian languages: Afaan Oromo, Amharic and Tigrigna, and English (both monolingual and code-mixed). Using a balanced and expert-annotated dataset, we assess five state-of-the-art LLM families across both tasks. Our results show that while LLMs perform well on English detection, their performance on low-resource languages is significantly weaker, revealing that increasing model size alone does not ensure multilingual robustness. More critically, we find that all models, including closed and open-source variants, can be prompted to generate profiled hate speech with minimal resistance. These findings underscore the dual risk of exclusion and exploitation: LLMs fail to protect low-resource communities while enabling scalable harm against them. We make our evaluation framework available to facilitate future research on multilingual model safety and ethical robustness.
%U https://aclanthology.org/2025.winlp-main.31/
%P 197-209

Markdown (Informal)

[Large Language Models as Detectors or Instigators of Hate Speech in Low-resource Ethiopian Languages](https://aclanthology.org/2025.winlp-main.31/) (Ibrahim et al., WiNLP 2025)

ACL

Nuhu Ibrahim, Felicity Mulford, and Riza Batista-Navarro. 2025. Large Language Models as Detectors or Instigators of Hate Speech in Low-resource Ethiopian Languages. In Proceedings of the 9th Widening NLP Workshop, pages 197–209, Suzhou, China. Association for Computational Linguistics.
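
A minimal LaTeX usage sketch for the BibTeX entry above (a hedged example, not part of the record: it assumes a natbib setup and a hypothetical references.bib file containing that entry; plainnat is a placeholder bibliography style, so swap in whatever .bst your venue supplies):

\documentclass{article}
\usepackage[numbers]{natbib}

\begin{document}
% \citep gives a parenthetical citation; \citet gives an in-text one.
Ibrahim et al.~\citep{ibrahim-etal-2025-large} benchmark LLMs for hate
speech detection and generation in low-resource Ethiopian languages.

\bibliographystyle{plainnat} % placeholder; use your venue's style file
\bibliography{references}    % assumes references.bib holds the entry above
\end{document}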