@inproceedings{sai-sharma-2021-towards,
title = "Towards Offensive Language Identification for {D}ravidian Languages",
author = "Sai, Siva and
Sharma, Yashvardhan",
editor = "Chakravarthi, Bharathi Raja and
Priyadharshini, Ruba and
Kumar M, Anand and
Krishnamurthy, Parameswari and
Sherly, Elizabeth",
booktitle = "Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages",
month = apr,
year = "2021",
address = "Kyiv",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.dravidianlangtech-1.3",
pages = "18--27",
abstract = "Offensive speech identification in countries like India poses several challenges due to the usage of code-mixed and romanized variants of multiple languages by the users in their posts on social media. The challenge of offensive language identification on social media for Dravidian languages is harder, considering the low resources available for the same. In this paper, we explored the zero-shot learning and few-shot learning paradigms based on multilingual language models for offensive speech detection in code-mixed and romanized variants of three Dravidian languages - Malayalam, Tamil, and Kannada. We propose a novel and flexible approach of selective translation and transliteration to reap better results from fine-tuning and ensembling multilingual transformer networks like XLMRoBERTa and mBERT. We implemented pretrained, fine-tuned, and ensembled versions of XLM-RoBERTa for offensive speech classification. Further, we experimented with interlanguage, inter-task, and multi-task transfer learning techniques to leverage the rich resources available for offensive speech identification in the English language and to enrich the models with knowledge transfer from related tasks. The proposed models yielded good results and are promising for effective offensive speech identification in low resource settings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sai-sharma-2021-towards">
<titleInfo>
<title>Towards Offensive Language Identification for Dravidian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Siva</namePart>
<namePart type="family">Sai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yashvardhan</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruba</namePart>
<namePart type="family">Priyadharshini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anand</namePart>
<namePart type="family">Kumar M</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parameswari</namePart>
<namePart type="family">Krishnamurthy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Sherly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyiv</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Offensive speech identification in countries like India poses several challenges due to the usage of code-mixed and romanized variants of multiple languages by the users in their posts on social media. The challenge of offensive language identification on social media for Dravidian languages is harder, considering the low resources available for the same. In this paper, we explored the zero-shot learning and few-shot learning paradigms based on multilingual language models for offensive speech detection in code-mixed and romanized variants of three Dravidian languages - Malayalam, Tamil, and Kannada. We propose a novel and flexible approach of selective translation and transliteration to reap better results from fine-tuning and ensembling multilingual transformer networks like XLMRoBERTa and mBERT. We implemented pretrained, fine-tuned, and ensembled versions of XLM-RoBERTa for offensive speech classification. Further, we experimented with interlanguage, inter-task, and multi-task transfer learning techniques to leverage the rich resources available for offensive speech identification in the English language and to enrich the models with knowledge transfer from related tasks. The proposed models yielded good results and are promising for effective offensive speech identification in low resource settings.</abstract>
<identifier type="citekey">sai-sharma-2021-towards</identifier>
<location>
<url>https://aclanthology.org/2021.dravidianlangtech-1.3</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>18</start>
<end>27</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards Offensive Language Identification for Dravidian Languages
%A Sai, Siva
%A Sharma, Yashvardhan
%Y Chakravarthi, Bharathi Raja
%Y Priyadharshini, Ruba
%Y Kumar M, Anand
%Y Krishnamurthy, Parameswari
%Y Sherly, Elizabeth
%S Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages
%D 2021
%8 April
%I Association for Computational Linguistics
%C Kyiv
%F sai-sharma-2021-towards
%X Offensive speech identification in countries like India poses several challenges due to the usage of code-mixed and romanized variants of multiple languages by the users in their posts on social media. The challenge of offensive language identification on social media for Dravidian languages is harder, considering the low resources available for the same. In this paper, we explored the zero-shot learning and few-shot learning paradigms based on multilingual language models for offensive speech detection in code-mixed and romanized variants of three Dravidian languages - Malayalam, Tamil, and Kannada. We propose a novel and flexible approach of selective translation and transliteration to reap better results from fine-tuning and ensembling multilingual transformer networks like XLMRoBERTa and mBERT. We implemented pretrained, fine-tuned, and ensembled versions of XLM-RoBERTa for offensive speech classification. Further, we experimented with interlanguage, inter-task, and multi-task transfer learning techniques to leverage the rich resources available for offensive speech identification in the English language and to enrich the models with knowledge transfer from related tasks. The proposed models yielded good results and are promising for effective offensive speech identification in low resource settings.
%U https://aclanthology.org/2021.dravidianlangtech-1.3
%P 18-27
Markdown (Informal)
[Towards Offensive Language Identification for Dravidian Languages](https://aclanthology.org/2021.dravidianlangtech-1.3) (Sai & Sharma, DravidianLangTech 2021)
ACL