% Conference paper entry (ACL Anthology ID 2025.ltedi-1.18).
@inproceedings{ruman-etal-2025-cuet,
    title     = "{CUET}{\_}{N}317@{LT}-{EDI}2025: Detecting Hate Speech Related to Caste and Migration with Transformer Models",
    author    = "Ruman, Md. Nur Siddik and
                 Chowdhury, Md. Tahfim Juwel and
                 Murad, Hasan",
    editor    = "Gkirtzou, Katerina and
                 {\v{Z}}itnik, Slavko and
                 Gracia, Jorge and
                 Gromann, Dagmar and
                 di Buono, Maria Pia and
                 Monti, Johanna and
                 Ionov, Maxim",
    booktitle = "Proceedings of the 5th Conference on Language, Data and Knowledge: Fifth Workshop on Language Technology for Equality, Diversity, Inclusion",
    month     = sep,
    year      = "2025",
    address   = "Naples, Italy",
    publisher = "Unior Press",
    url       = "https://aclanthology.org/2025.ltedi-1.18/",
    pages     = "105--110",
    isbn      = "978-88-6719-334-9",
    abstract  = "Language that criticizes, threatens, or discriminates against people or groups because of their caste, social rank, or status is known as caste and migration hate speech and it has grown incredibly common on social media. Such speech not only contributes to social disruption and inequity, but it also puts at risk the safety and mental health of the targeted groups. Due to the absence of labeled data, the subtlety of culturally unique insults, and the lack of strong linguistic resources for deep text recognition, it is especially difficult to detect caste and migration hate speech in low-resource Dravidian languages like Tamil. In this work, we address the Caste and Migration Hate Speech Detection task, aiming to automatically classify user-generated content as either hateful or non-hateful. We evaluate a range of approaches, including a traditional TF-IDF-based machine learning pipeline using SVM and Logistic Regression, alongside five transformer-based models: mBERT, XLM-R, MuRIL, Tamil BERT, and Tamilhate-BERT. Among these, the domain-adapted Tamilhate BERT achieved the highest macro-F1 score of 0.88 on the test data, securing 1st place in the Shared Task on Caste and Migration Hate Speech Detection at DravidianLangTech@LT-EDI 2025. Our findings highlight the strong performance of transformer models, particularly those fine-tuned on domain-specific data, in detecting nuanced hate speech in low-resource, code-mixed languages like Tamil."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ruman-etal-2025-cuet">
<titleInfo>
<title>CUET_N317@LT-EDI2025: Detecting Hate Speech Related to Caste and Migration with Transformer Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Nur</namePart>
<namePart type="given">Siddik</namePart>
<namePart type="family">Ruman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Tahfim</namePart>
<namePart type="given">Juwel</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hasan</namePart>
<namePart type="family">Murad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Conference on Language, Data and Knowledge: Fifth Workshop on Language Technology for Equality, Diversity, Inclusion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Katerina</namePart>
<namePart type="family">Gkirtzou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slavko</namePart>
<namePart type="family">Žitnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Gracia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dagmar</namePart>
<namePart type="family">Gromann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Pia</namePart>
<namePart type="family">di Buono</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johanna</namePart>
<namePart type="family">Monti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxim</namePart>
<namePart type="family">Ionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Unior Press</publisher>
<place>
<placeTerm type="text">Naples, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-88-6719-334-9</identifier>
</relatedItem>
<abstract>Language that criticizes, threatens, or discriminates against people or groups because of their caste, social rank, or status is known as caste and migration hate speech and it has grown incredibly common on social media. Such speech not only contributes to social disruption and inequity, but it also puts at risk the safety and mental health of the targeted groups. Due to the absence of labeled data, the subtlety of culturally unique insults, and the lack of strong linguistic resources for deep text recognition, it is especially difficult to detect caste and migration hate speech in low-resource Dravidian languages like Tamil. In this work, we address the Caste and Migration Hate Speech Detection task, aiming to automatically classify user-generated content as either hateful or non-hateful. We evaluate a range of approaches, including a traditional TF-IDF-based machine learning pipeline using SVM and Logistic Regression, alongside five transformer-based models: mBERT, XLM-R, MuRIL, Tamil BERT, and Tamilhate-BERT. Among these, the domain-adapted Tamilhate BERT achieved the highest macro-F1 score of 0.88 on the test data, securing 1st place in the Shared Task on Caste and Migration Hate Speech Detection at DravidianLangTech@LT-EDI 2025. Our findings highlight the strong performance of transformer models, particularly those fine-tuned on domain-specific data, in detecting nuanced hate speech in low-resource, code-mixed languages like Tamil.</abstract>
<identifier type="citekey">ruman-etal-2025-cuet</identifier>
<location>
<url>https://aclanthology.org/2025.ltedi-1.18/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>105</start>
<end>110</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CUET_N317@LT-EDI2025: Detecting Hate Speech Related to Caste and Migration with Transformer Models
%A Ruman, Md. Nur Siddik
%A Chowdhury, Md. Tahfim Juwel
%A Murad, Hasan
%Y Gkirtzou, Katerina
%Y Žitnik, Slavko
%Y Gracia, Jorge
%Y Gromann, Dagmar
%Y di Buono, Maria Pia
%Y Monti, Johanna
%Y Ionov, Maxim
%S Proceedings of the 5th Conference on Language, Data and Knowledge: Fifth Workshop on Language Technology for Equality, Diversity, Inclusion
%D 2025
%8 September
%I Unior Press
%C Naples, Italy
%@ 978-88-6719-334-9
%F ruman-etal-2025-cuet
%X Language that criticizes, threatens, or discriminates against people or groups because of their caste, social rank, or status is known as caste and migration hate speech and it has grown incredibly common on social media. Such speech not only contributes to social disruption and inequity, but it also puts at risk the safety and mental health of the targeted groups. Due to the absence of labeled data, the subtlety of culturally unique insults, and the lack of strong linguistic resources for deep text recognition, it is especially difficult to detect caste and migration hate speech in low-resource Dravidian languages like Tamil. In this work, we address the Caste and Migration Hate Speech Detection task, aiming to automatically classify user-generated content as either hateful or non-hateful. We evaluate a range of approaches, including a traditional TF-IDF-based machine learning pipeline using SVM and Logistic Regression, alongside five transformer-based models: mBERT, XLM-R, MuRIL, Tamil BERT, and Tamilhate-BERT. Among these, the domain-adapted Tamilhate BERT achieved the highest macro-F1 score of 0.88 on the test data, securing 1st place in the Shared Task on Caste and Migration Hate Speech Detection at DravidianLangTech@LT-EDI 2025. Our findings highlight the strong performance of transformer models, particularly those fine-tuned on domain-specific data, in detecting nuanced hate speech in low-resource, code-mixed languages like Tamil.
%U https://aclanthology.org/2025.ltedi-1.18/
%P 105-110
Markdown (Informal)
[CUET_N317@LT-EDI2025: Detecting Hate Speech Related to Caste and Migration with Transformer Models](https://aclanthology.org/2025.ltedi-1.18/) (Ruman et al., LTEDI 2025)
ACL