% BibTeX record for the BanTH paper in the Findings of NAACL 2025 proceedings.
% Braces in title/booktitle protect whole words whose capitalisation must survive style recasing.
@inproceedings{haider-etal-2025-banth,
  author    = {Haider, Fabiha and
               Shifat, Fariha Tanjim and
               Ishmam, Md Farhan and
               Sourove, Md Sakib Ul Rahman and
               Barua, Deeparghya Dutta and
               Fahim, Md and
               Bhuiyan, Md Farhad Alam},
  title     = {{BanTH}: A Multi-label Hate Speech Detection Dataset for Transliterated {Bangla}},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Findings of the Association for Computational Linguistics: {NAACL} 2025},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  pages     = {7217--7236},
  doi       = {10.18653/v1/2025.findings-naacl.403},
  url       = {https://aclanthology.org/2025.findings-naacl.403/},
  isbn      = {979-8-89176-195-7},
  abstract  = {The proliferation of transliterated texts in digital spaces has emphasized the need for detecting and classifying hate speech in languages beyond English, particularly in low-resource languages. As online discourse can perpetuate discrimination based on target groups, e.g. gender, religion, and origin, multi-label classification of hateful content can help in understanding hate motivation and enhance content moderation. While previous efforts have focused on monolingual or binary hate classification tasks, no work has yet addressed the challenge of multi-label hate speech classification in transliterated Bangla. We introduce BanTH, the first multi-label transliterated Bangla hate speech dataset. The samples are sourced from YouTube comments, where each instance is labeled with one or more target groups, reflecting the regional demographic. We propose a novel translation-based LLM prompting strategy that translates or transliterates under-resourced text to higher-resourced text before classifying the hate group(s). Experiments reveal further pre-trained encoders achieving state-of-the-art performance on the BanTH dataset while translation-based prompting outperforms other strategies in the zero-shot setting. We address a critical gap in Bangla hate speech and set the stage for further exploration into code-mixed and multi-label classification in underrepresented languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey haider-etal-2025-banth: title, authors, issue date, host proceedings (relatedItem type="host"), abstract, identifiers, URL and page extent. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="haider-etal-2025-banth">
    <titleInfo>
      <title>BanTH: A Multi-label Hate Speech Detection Dataset for Transliterated Bangla</title>
    </titleInfo>
    <!-- Authors, in byline order -->
    <name type="personal">
      <namePart type="given">Fabiha</namePart>
      <namePart type="family">Haider</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fariha</namePart>
      <namePart type="given">Tanjim</namePart>
      <namePart type="family">Shifat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Md</namePart>
      <namePart type="given">Farhan</namePart>
      <namePart type="family">Ishmam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Md</namePart>
      <namePart type="given">Sakib</namePart>
      <namePart type="given">Ul</namePart>
      <namePart type="given">Rahman</namePart>
      <namePart type="family">Sourove</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Deeparghya</namePart>
      <namePart type="given">Dutta</namePart>
      <namePart type="family">Barua</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Md</namePart>
      <namePart type="family">Fahim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Md</namePart>
      <namePart type="given">Farhad</namePart>
      <namePart type="given">Alam</namePart>
      <namePart type="family">Bhuiyan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host proceedings volume: title, editors, publisher, place, genre and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>The proliferation of transliterated texts in digital spaces has emphasized the need for detecting and classifying hate speech in languages beyond English, particularly in low-resource languages. As online discourse can perpetuate discrimination based on target groups, e.g. gender, religion, and origin, multi-label classification of hateful content can help in understanding hate motivation and enhance content moderation. While previous efforts have focused on monolingual or binary hate classification tasks, no work has yet addressed the challenge of multi-label hate speech classification in transliterated Bangla. We introduce BanTH, the first multi-label transliterated Bangla hate speech dataset. The samples are sourced from YouTube comments, where each instance is labeled with one or more target groups, reflecting the regional demographic. We propose a novel translation-based LLM prompting strategy that translates or transliterates under-resourced text to higher-resourced text before classifying the hate group(s). Experiments reveal further pre-trained encoders achieving state-of-the-art performance on the BanTH dataset while translation-based prompting outperforms other strategies in the zero-shot setting. We address a critical gap in Bangla hate speech and set the stage for further exploration into code-mixed and multi-label classification in underrepresented languages.</abstract>
    <identifier type="citekey">haider-etal-2025-banth</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-naacl.403</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.403/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>7217</start>
        <end>7236</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T BanTH: A Multi-label Hate Speech Detection Dataset for Transliterated Bangla
%A Haider, Fabiha
%A Shifat, Fariha Tanjim
%A Ishmam, Md Farhan
%A Sourove, Md Sakib Ul Rahman
%A Barua, Deeparghya Dutta
%A Fahim, Md
%A Bhuiyan, Md Farhad Alam
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F haider-etal-2025-banth
%X The proliferation of transliterated texts in digital spaces has emphasized the need for detecting and classifying hate speech in languages beyond English, particularly in low-resource languages. As online discourse can perpetuate discrimination based on target groups, e.g. gender, religion, and origin, multi-label classification of hateful content can help in understanding hate motivation and enhance content moderation. While previous efforts have focused on monolingual or binary hate classification tasks, no work has yet addressed the challenge of multi-label hate speech classification in transliterated Bangla. We introduce BanTH, the first multi-label transliterated Bangla hate speech dataset. The samples are sourced from YouTube comments, where each instance is labeled with one or more target groups, reflecting the regional demographic. We propose a novel translation-based LLM prompting strategy that translates or transliterates under-resourced text to higher-resourced text before classifying the hate group(s). Experiments reveal further pre-trained encoders achieving state-of-the-art performance on the BanTH dataset while translation-based prompting outperforms other strategies in the zero-shot setting. We address a critical gap in Bangla hate speech and set the stage for further exploration into code-mixed and multi-label classification in underrepresented languages.
%R 10.18653/v1/2025.findings-naacl.403
%U https://aclanthology.org/2025.findings-naacl.403/
%U https://doi.org/10.18653/v1/2025.findings-naacl.403
%P 7217-7236
Markdown (Informal)
[BanTH: A Multi-label Hate Speech Detection Dataset for Transliterated Bangla](https://aclanthology.org/2025.findings-naacl.403/) (Haider et al., Findings 2025)
ACL
- Fabiha Haider, Fariha Tanjim Shifat, Md Farhan Ishmam, Md Sakib Ul Rahman Sourove, Deeparghya Dutta Barua, Md Fahim, and Md Farhad Alam Bhuiyan. 2025. BanTH: A Multi-label Hate Speech Detection Dataset for Transliterated Bangla. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 7217–7236, Albuquerque, New Mexico. Association for Computational Linguistics.