@inproceedings{ravikiran-annamalai-2021-dosa,
title = "{DOSA}: {D}ravidian Code-Mixed Offensive Span Identification Dataset",
author = "Ravikiran, Manikandan and
Annamalai, Subbiah",
editor = "Chakravarthi, Bharathi Raja and
Priyadharshini, Ruba and
Kumar M, Anand and
Krishnamurthy, Parameswari and
Sherly, Elizabeth",
booktitle = "Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages",
month = apr,
year = "2021",
address = "Kyiv",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.dravidianlangtech-1.2",
pages = "10--17",
abstract = "This paper presents the Dravidian Offensive Span Identification Dataset (DOSA) for under-resourced Tamil-English and Kannada-English code-mixed text. The dataset addresses the lack of code-mixed datasets with annotated offensive spans by extending annotations of existing code-mixed offensive language identification datasets. It provides span annotations for Tamil-English and Kannada-English code-mixed comments posted by users on YouTube social media. Overall the dataset consists of 4786 Tamil-English comments with 6202 annotated spans and 1097 Kannada-English comments with 1641 annotated spans, each annotated by two different annotators. We further present some of our baseline experimental results on the developed dataset, thereby eliciting research in under-resourced languages, leading to an essential step towards semi-automated content moderation in Dravidian languages. The dataset is available in \url{https://github.com/teamdl-mlsg/DOSA}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ravikiran-annamalai-2021-dosa">
<titleInfo>
<title>DOSA: Dravidian Code-Mixed Offensive Span Identification Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manikandan</namePart>
<namePart type="family">Ravikiran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Subbiah</namePart>
<namePart type="family">Annamalai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruba</namePart>
<namePart type="family">Priyadharshini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anand</namePart>
<namePart type="family">Kumar M</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parameswari</namePart>
<namePart type="family">Krishnamurthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Sherly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyiv</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents the Dravidian Offensive Span Identification Dataset (DOSA) for under-resourced Tamil-English and Kannada-English code-mixed text. The dataset addresses the lack of code-mixed datasets with annotated offensive spans by extending annotations of existing code-mixed offensive language identification datasets. It provides span annotations for Tamil-English and Kannada-English code-mixed comments posted by users on YouTube social media. Overall the dataset consists of 4786 Tamil-English comments with 6202 annotated spans and 1097 Kannada-English comments with 1641 annotated spans, each annotated by two different annotators. We further present some of our baseline experimental results on the developed dataset, thereby eliciting research in under-resourced languages, leading to an essential step towards semi-automated content moderation in Dravidian languages. The dataset is available in https://github.com/teamdl-mlsg/DOSA</abstract>
<identifier type="citekey">ravikiran-annamalai-2021-dosa</identifier>
<location>
<url>https://aclanthology.org/2021.dravidianlangtech-1.2</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>10</start>
<end>17</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DOSA: Dravidian Code-Mixed Offensive Span Identification Dataset
%A Ravikiran, Manikandan
%A Annamalai, Subbiah
%Y Chakravarthi, Bharathi Raja
%Y Priyadharshini, Ruba
%Y Kumar M, Anand
%Y Krishnamurthy, Parameswari
%Y Sherly, Elizabeth
%S Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages
%D 2021
%8 April
%I Association for Computational Linguistics
%C Kyiv
%F ravikiran-annamalai-2021-dosa
%X This paper presents the Dravidian Offensive Span Identification Dataset (DOSA) for under-resourced Tamil-English and Kannada-English code-mixed text. The dataset addresses the lack of code-mixed datasets with annotated offensive spans by extending annotations of existing code-mixed offensive language identification datasets. It provides span annotations for Tamil-English and Kannada-English code-mixed comments posted by users on YouTube social media. Overall the dataset consists of 4786 Tamil-English comments with 6202 annotated spans and 1097 Kannada-English comments with 1641 annotated spans, each annotated by two different annotators. We further present some of our baseline experimental results on the developed dataset, thereby eliciting research in under-resourced languages, leading to an essential step towards semi-automated content moderation in Dravidian languages. The dataset is available in https://github.com/teamdl-mlsg/DOSA
%U https://aclanthology.org/2021.dravidianlangtech-1.2
%P 10-17
Markdown (Informal)
[DOSA: Dravidian Code-Mixed Offensive Span Identification Dataset](https://aclanthology.org/2021.dravidianlangtech-1.2) (Ravikiran & Annamalai, DravidianLangTech 2021)
ACL