@inproceedings{sazzed-2021-abusive,
title = "Abusive content detection in transliterated {B}engali-{E}nglish social media corpus",
author = "Sazzed, Salim",
editor = "Solorio, Thamar and
Chen, Shuguang and
Black, Alan W. and
Diab, Mona and
Sitaram, Sunayana and
Soto, Victor and
Yilmaz, Emre and
Srinivasan, Anirudh",
booktitle = "Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.calcs-1.16",
doi = "10.18653/v1/2021.calcs-1.16",
pages = "125--130",
abstract = "Abusive text detection in low-resource languages such as Bengali is a challenging task due to the inadequacy of resources and tools. The ubiquity of transliterated Bengali comments in social media makes the task even more involved as monolingual approaches cannot capture them. Unfortunately, no transliterated Bengali corpus is publicly available yet for abusive content analysis. Therefore, in this paper, we introduce an annotated Bengali corpus of 3000 transliterated Bengali comments categorized into two classes, abusive and non-abusive, 1500 comments for each. For baseline evaluations, we employ several supervised machine learning (ML) and deep learning-based classifiers. We find support vector machine (SVM) shows the highest efficacy for identifying abusive content. We make the annotated corpus freely available for the researcher to aid abusive content detection in Bengali social media data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sazzed-2021-abusive">
<titleInfo>
<title>Abusive content detection in transliterated Bengali-English social media corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Salim</namePart>
<namePart type="family">Sazzed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching</title>
</titleInfo>
<name type="personal">
<namePart type="given">Thamar</namePart>
<namePart type="family">Solorio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuguang</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="given">W</namePart>
<namePart type="family">Black</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mona</namePart>
<namePart type="family">Diab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sunayana</namePart>
<namePart type="family">Sitaram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Soto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emre</namePart>
<namePart type="family">Yilmaz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anirudh</namePart>
<namePart type="family">Srinivasan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Abusive text detection in low-resource languages such as Bengali is a challenging task due to the inadequacy of resources and tools. The ubiquity of transliterated Bengali comments in social media makes the task even more involved as monolingual approaches cannot capture them. Unfortunately, no transliterated Bengali corpus is publicly available yet for abusive content analysis. Therefore, in this paper, we introduce an annotated Bengali corpus of 3000 transliterated Bengali comments categorized into two classes, abusive and non-abusive, 1500 comments for each. For baseline evaluations, we employ several supervised machine learning (ML) and deep learning-based classifiers. We find support vector machine (SVM) shows the highest efficacy for identifying abusive content. We make the annotated corpus freely available for the researcher to aid abusive content detection in Bengali social media data.</abstract>
<identifier type="citekey">sazzed-2021-abusive</identifier>
<identifier type="doi">10.18653/v1/2021.calcs-1.16</identifier>
<location>
<url>https://aclanthology.org/2021.calcs-1.16</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>125</start>
<end>130</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Abusive content detection in transliterated Bengali-English social media corpus
%A Sazzed, Salim
%Y Solorio, Thamar
%Y Chen, Shuguang
%Y Black, Alan W.
%Y Diab, Mona
%Y Sitaram, Sunayana
%Y Soto, Victor
%Y Yilmaz, Emre
%Y Srinivasan, Anirudh
%S Proceedings of the Fifth Workshop on Computational Approaches to Linguistic Code-Switching
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F sazzed-2021-abusive
%X Abusive text detection in low-resource languages such as Bengali is a challenging task due to the inadequacy of resources and tools. The ubiquity of transliterated Bengali comments in social media makes the task even more involved as monolingual approaches cannot capture them. Unfortunately, no transliterated Bengali corpus is publicly available yet for abusive content analysis. Therefore, in this paper, we introduce an annotated Bengali corpus of 3000 transliterated Bengali comments categorized into two classes, abusive and non-abusive, 1500 comments for each. For baseline evaluations, we employ several supervised machine learning (ML) and deep learning-based classifiers. We find support vector machine (SVM) shows the highest efficacy for identifying abusive content. We make the annotated corpus freely available for the researcher to aid abusive content detection in Bengali social media data.
%R 10.18653/v1/2021.calcs-1.16
%U https://aclanthology.org/2021.calcs-1.16
%U https://doi.org/10.18653/v1/2021.calcs-1.16
%P 125-130
Markdown (Informal)
[Abusive content detection in transliterated Bengali-English social media corpus](https://aclanthology.org/2021.calcs-1.16) (Sazzed, CALCS 2021)
ACL