@inproceedings{gharbi-etal-2021-teet,
title = "{TEET}! {T}unisian Dataset for Toxic Speech Detection",
author = "Gharbi, Slim and
Haddad, Hatem and
Kchaou, Mayssa and
Arfaoui, Heger",
editor = "Varis, Erika and
Georgi, Ryan and
Tsai, Alicia and
Anastasopoulos, Antonios and
Chandu, Kyathi and
Schofield, Xanda and
Ranathunga, Surangika and
Lepp, Haley and
Ghosal, Tirthankar",
booktitle = "Proceedings of the Fifth Workshop on Widening Natural Language Processing",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.winlp-1.2/",
pages = "5--17",
abstract = "The complete freedom of expression in social media has its costs, especially in spreading harmful and abusive content that may induce people to act accordingly. Therefore, the need to automatically detect such content becomes an urgent task that will help and enhance the efficiency in limiting this toxic spread. Compared to other Arabic dialects which are mostly based on MSA, the Tunisian dialect is a combination of many other languages like MSA, Tamazight, Italian and French. Because of its rich language, dealing with NLP problems can be challenging due to the lack of large annotated datasets. In our context of detecting hate and abusive speech for the Tunisian dialect, the only existing annotated dataset is T-HSAB, combining 6,039 comments annotated as hateful, abusive or normal. In this paper we introduce a larger annotated dataset composed of approximately 10k comments. We provide an in-depth exploration of its vocabulary as well as the results of the classification performance of machine learning classifiers like NB and SVM and deep learning models such as ARBERT, MARBERT and XLM-R."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gharbi-etal-2021-teet">
<titleInfo>
<title>TEET! Tunisian Dataset for Toxic Speech Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Slim</namePart>
<namePart type="family">Gharbi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatem</namePart>
<namePart type="family">Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mayssa</namePart>
<namePart type="family">Kchaou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heger</namePart>
<namePart type="family">Arfaoui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Widening Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Erika</namePart>
<namePart type="family">Varis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Georgi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alicia</namePart>
<namePart type="family">Tsai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyathi</namePart>
<namePart type="family">Chandu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xanda</namePart>
<namePart type="family">Schofield</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surangika</namePart>
<namePart type="family">Ranathunga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haley</namePart>
<namePart type="family">Lepp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The complete freedom of expression in social media has its costs, especially in spreading harmful and abusive content that may induce people to act accordingly. Therefore, the need to automatically detect such content becomes an urgent task that will help and enhance the efficiency in limiting this toxic spread. Compared to other Arabic dialects which are mostly based on MSA, the Tunisian dialect is a combination of many other languages like MSA, Tamazight, Italian and French. Because of its rich language, dealing with NLP problems can be challenging due to the lack of large annotated datasets. In our context of detecting hate and abusive speech for the Tunisian dialect, the only existing annotated dataset is T-HSAB, combining 6,039 comments annotated as hateful, abusive or normal. In this paper we introduce a larger annotated dataset composed of approximately 10k comments. We provide an in-depth exploration of its vocabulary as well as the results of the classification performance of machine learning classifiers like NB and SVM and deep learning models such as ARBERT, MARBERT and XLM-R.</abstract>
<identifier type="citekey">gharbi-etal-2021-teet</identifier>
<location>
<url>https://aclanthology.org/2021.winlp-1.2/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>5</start>
<end>17</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TEET! Tunisian Dataset for Toxic Speech Detection
%A Gharbi, Slim
%A Haddad, Hatem
%A Kchaou, Mayssa
%A Arfaoui, Heger
%Y Varis, Erika
%Y Georgi, Ryan
%Y Tsai, Alicia
%Y Anastasopoulos, Antonios
%Y Chandu, Kyathi
%Y Schofield, Xanda
%Y Ranathunga, Surangika
%Y Lepp, Haley
%Y Ghosal, Tirthankar
%S Proceedings of the Fifth Workshop on Widening Natural Language Processing
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F gharbi-etal-2021-teet
%X The complete freedom of expression in social media has its costs, especially in spreading harmful and abusive content that may induce people to act accordingly. Therefore, the need to automatically detect such content becomes an urgent task that will help and enhance the efficiency in limiting this toxic spread. Compared to other Arabic dialects which are mostly based on MSA, the Tunisian dialect is a combination of many other languages like MSA, Tamazight, Italian and French. Because of its rich language, dealing with NLP problems can be challenging due to the lack of large annotated datasets. In our context of detecting hate and abusive speech for the Tunisian dialect, the only existing annotated dataset is T-HSAB, combining 6,039 comments annotated as hateful, abusive or normal. In this paper we introduce a larger annotated dataset composed of approximately 10k comments. We provide an in-depth exploration of its vocabulary as well as the results of the classification performance of machine learning classifiers like NB and SVM and deep learning models such as ARBERT, MARBERT and XLM-R.
%U https://aclanthology.org/2021.winlp-1.2/
%P 5-17
Markdown (Informal)
[TEET! Tunisian Dataset for Toxic Speech Detection](https://aclanthology.org/2021.winlp-1.2/) (Gharbi et al., WiNLP 2021)
ACL
- Slim Gharbi, Hatem Haddad, Mayssa Kchaou, and Heger Arfaoui. 2021. TEET! Tunisian Dataset for Toxic Speech Detection. In Proceedings of the Fifth Workshop on Widening Natural Language Processing, pages 5–17, Punta Cana, Dominican Republic. Association for Computational Linguistics.