@inproceedings{achamaleh-etal-2025-em-26,
title = "{EM}-26@{LT}-{EDI} 2025: Caste and Migration Hate Speech Detection in {T}amil-{E}nglish Code-Mixed Social Media Texts",
author = "Achamaleh, Tewodros and
Abiola, Tolulope Olalekan and
Mebraihtu, Mikiyas and
Getachew, Sara and
Sidorov, Grigori",
editor = "Gkirtzou, Katerina and
{\v{Z}}itnik, Slavko and
Gracia, Jorge and
Gromann, Dagmar and
di Buono, Maria Pia and
Monti, Johanna and
Ionov, Maxim",
booktitle = "Proceedings of the 5th Conference on Language, Data and Knowledge: Fifth Workshop on Language Technology for Equality, Diversity, Inclusion",
month = sep,
year = "2025",
address = "Naples, Italy",
publisher = "Unior Press",
url = "https://aclanthology.org/2025.ltedi-1.26/",
pages = "153--159",
ISBN = "978-88-6719-334-9",
abstract = "In this paper, we describe the system developed by Team EM-26 for the Shared Task on Caste and Migration Hate Speech Detection at LTEDI@LDK 2025. The task addresses the challenge of recognizing caste-based and migration related hate speech in Tamil social media text, a language that is both nuanced and under resourced for machine learning. To tackle this, we fine-tuned the multilingual transformer XLM-RoBERTa-Large on the provided training data, leveraging its cross-lingual strengths to detect both explicit and implicit hate speech. To improve performance, we applied social media focused preprocessing techniques, including Tamil text normalization and noise removal. Our model achieved a macro F1-score of 0.6567 on the test set, highlighting the effectiveness of multilingual transformers for low resource hate speech detection. Additionally, we discuss key challenges and errors in Tamil hate speech classification, which may guide future work toward building more ethical and inclusive AI systems. The source code is available on GitHub."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="achamaleh-etal-2025-em-26">
<titleInfo>
<title>EM-26@LT-EDI 2025: Caste and Migration Hate Speech Detection in Tamil-English Code-Mixed Social Media Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tewodros</namePart>
<namePart type="family">Achamaleh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tolulope</namePart>
<namePart type="given">Olalekan</namePart>
<namePart type="family">Abiola</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikiyas</namePart>
<namePart type="family">Mebraihtu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Getachew</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Grigori</namePart>
<namePart type="family">Sidorov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Conference on Language, Data and Knowledge: Fifth Workshop on Language Technology for Equality, Diversity, Inclusion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Katerina</namePart>
<namePart type="family">Gkirtzou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slavko</namePart>
<namePart type="family">Žitnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorge</namePart>
<namePart type="family">Gracia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dagmar</namePart>
<namePart type="family">Gromann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Pia</namePart>
<namePart type="family">di Buono</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johanna</namePart>
<namePart type="family">Monti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maxim</namePart>
<namePart type="family">Ionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Unior Press</publisher>
<place>
<placeTerm type="text">Naples, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-88-6719-334-9</identifier>
</relatedItem>
<abstract>In this paper, we describe the system developed by Team EM-26 for the Shared Task on Caste and Migration Hate Speech Detection at LTEDI@LDK 2025. The task addresses the challenge of recognizing caste-based and migration related hate speech in Tamil social media text, a language that is both nuanced and under resourced for machine learning. To tackle this, we fine-tuned the multilingual transformer XLM-RoBERTa-Large on the provided training data, leveraging its cross-lingual strengths to detect both explicit and implicit hate speech. To improve performance, we applied social media focused preprocessing techniques, including Tamil text normalization and noise removal. Our model achieved a macro F1-score of 0.6567 on the test set, highlighting the effectiveness of multilingual transformers for low resource hate speech detection. Additionally, we discuss key challenges and errors in Tamil hate speech classification, which may guide future work toward building more ethical and inclusive AI systems. The source code is available on GitHub.</abstract>
<identifier type="citekey">achamaleh-etal-2025-em-26</identifier>
<location>
<url>https://aclanthology.org/2025.ltedi-1.26/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>153</start>
<end>159</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EM-26@LT-EDI 2025: Caste and Migration Hate Speech Detection in Tamil-English Code-Mixed Social Media Texts
%A Achamaleh, Tewodros
%A Abiola, Tolulope Olalekan
%A Mebraihtu, Mikiyas
%A Getachew, Sara
%A Sidorov, Grigori
%Y Gkirtzou, Katerina
%Y Žitnik, Slavko
%Y Gracia, Jorge
%Y Gromann, Dagmar
%Y di Buono, Maria Pia
%Y Monti, Johanna
%Y Ionov, Maxim
%S Proceedings of the 5th Conference on Language, Data and Knowledge: Fifth Workshop on Language Technology for Equality, Diversity, Inclusion
%D 2025
%8 September
%I Unior Press
%C Naples, Italy
%@ 978-88-6719-334-9
%F achamaleh-etal-2025-em-26
%X In this paper, we describe the system developed by Team EM-26 for the Shared Task on Caste and Migration Hate Speech Detection at LTEDI@LDK 2025. The task addresses the challenge of recognizing caste-based and migration related hate speech in Tamil social media text, a language that is both nuanced and under resourced for machine learning. To tackle this, we fine-tuned the multilingual transformer XLM-RoBERTa-Large on the provided training data, leveraging its cross-lingual strengths to detect both explicit and implicit hate speech. To improve performance, we applied social media focused preprocessing techniques, including Tamil text normalization and noise removal. Our model achieved a macro F1-score of 0.6567 on the test set, highlighting the effectiveness of multilingual transformers for low resource hate speech detection. Additionally, we discuss key challenges and errors in Tamil hate speech classification, which may guide future work toward building more ethical and inclusive AI systems. The source code is available on GitHub.
%U https://aclanthology.org/2025.ltedi-1.26/
%P 153-159
Markdown (Informal)
[EM-26@LT-EDI 2025: Caste and Migration Hate Speech Detection in Tamil-English Code-Mixed Social Media Texts](https://aclanthology.org/2025.ltedi-1.26/) (Achamaleh et al., LTEDI 2025)
ACL
- Tewodros Achamaleh, Tolulope Olalekan Abiola, Mikiyas Mebraihtu, Sara Getachew, and Grigori Sidorov. 2025. EM-26@LT-EDI 2025: Caste and Migration Hate Speech Detection in Tamil-English Code-Mixed Social Media Texts. In Proceedings of the 5th Conference on Language, Data and Knowledge: Fifth Workshop on Language Technology for Equality, Diversity, Inclusion, pages 153–159, Naples, Italy. Unior Press.