@inproceedings{patil-etal-2022-l3cube,
title = "{L}3{C}ube-{M}aha{H}ate: A Tweet-based {M}arathi Hate Speech Detection Dataset and {BERT} Models",
author = "Patil, Hrushikesh and
Velankar, Abhishek and
Joshi, Raviraj",
editor = "Kumar, Ritesh and
Ojha, Atul Kr. and
Zampieri, Marcos and
Malmasi, Shervin and
Kadar, Daniel",
booktitle = "Proceedings of the Third Workshop on Threat, Aggression and Cyberbullying (TRAC 2022)",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.trac-1.1/",
pages = "1--9",
abstract = "Social media platforms are used by a large number of people prominently to express their thoughts and opinions. However, these platforms have contributed to a substantial amount of hateful and abusive content as well. Therefore, it is important to curb the spread of hate speech on these platforms. In India, Marathi is one of the most popular languages used by a wide audience. In this work, we present L3Cube-MahaHate, the first major Hate Speech Dataset in Marathi. The dataset is curated from Twitter, annotated manually. Our dataset consists of over 25000 distinct tweets labeled into four major classes i.e hate, offensive, profane, and not. We present the approaches used for collecting and annotating the data and the challenges faced during the process. Finally, we present baseline classification results using deep learning models based on CNN, LSTM, and Transformers. We explore mono-lingual and multi-lingual variants of BERT like MahaBERT, IndicBERT, mBERT, and xlm-RoBERTa and show that mono-lingual models perform better than their multi-lingual counterparts. The MahaBERT model provides the best results on L3Cube-MahaHate Corpus."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="patil-etal-2022-l3cube">
<titleInfo>
<title>L3Cube-MahaHate: A Tweet-based Marathi Hate Speech Detection Dataset and BERT Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hrushikesh</namePart>
<namePart type="family">Patil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhishek</namePart>
<namePart type="family">Velankar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raviraj</namePart>
<namePart type="family">Joshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Threat, Aggression and Cyberbullying (TRAC 2022)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ritesh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shervin</namePart>
<namePart type="family">Malmasi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Kadar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Social media platforms are used by a large number of people prominently to express their thoughts and opinions. However, these platforms have contributed to a substantial amount of hateful and abusive content as well. Therefore, it is important to curb the spread of hate speech on these platforms. In India, Marathi is one of the most popular languages used by a wide audience. In this work, we present L3Cube-MahaHate, the first major Hate Speech Dataset in Marathi. The dataset is curated from Twitter, annotated manually. Our dataset consists of over 25000 distinct tweets labeled into four major classes i.e hate, offensive, profane, and not. We present the approaches used for collecting and annotating the data and the challenges faced during the process. Finally, we present baseline classification results using deep learning models based on CNN, LSTM, and Transformers. We explore mono-lingual and multi-lingual variants of BERT like MahaBERT, IndicBERT, mBERT, and xlm-RoBERTa and show that mono-lingual models perform better than their multi-lingual counterparts. The MahaBERT model provides the best results on L3Cube-MahaHate Corpus.</abstract>
<identifier type="citekey">patil-etal-2022-l3cube</identifier>
<location>
<url>https://aclanthology.org/2022.trac-1.1/</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>1</start>
<end>9</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T L3Cube-MahaHate: A Tweet-based Marathi Hate Speech Detection Dataset and BERT Models
%A Patil, Hrushikesh
%A Velankar, Abhishek
%A Joshi, Raviraj
%Y Kumar, Ritesh
%Y Ojha, Atul Kr.
%Y Zampieri, Marcos
%Y Malmasi, Shervin
%Y Kadar, Daniel
%S Proceedings of the Third Workshop on Threat, Aggression and Cyberbullying (TRAC 2022)
%D 2022
%8 October
%I Association for Computational Linguistics
%C Gyeongju, Republic of Korea
%F patil-etal-2022-l3cube
%X Social media platforms are used by a large number of people prominently to express their thoughts and opinions. However, these platforms have contributed to a substantial amount of hateful and abusive content as well. Therefore, it is important to curb the spread of hate speech on these platforms. In India, Marathi is one of the most popular languages used by a wide audience. In this work, we present L3Cube-MahaHate, the first major Hate Speech Dataset in Marathi. The dataset is curated from Twitter, annotated manually. Our dataset consists of over 25000 distinct tweets labeled into four major classes i.e hate, offensive, profane, and not. We present the approaches used for collecting and annotating the data and the challenges faced during the process. Finally, we present baseline classification results using deep learning models based on CNN, LSTM, and Transformers. We explore mono-lingual and multi-lingual variants of BERT like MahaBERT, IndicBERT, mBERT, and xlm-RoBERTa and show that mono-lingual models perform better than their multi-lingual counterparts. The MahaBERT model provides the best results on L3Cube-MahaHate Corpus.
%U https://aclanthology.org/2022.trac-1.1/
%P 1-9
Markdown (Informal)
[L3Cube-MahaHate: A Tweet-based Marathi Hate Speech Detection Dataset and BERT Models](https://aclanthology.org/2022.trac-1.1/) (Patil et al., TRAC 2022)
ACL