@inproceedings{kapil-ekbal-2024-corpus,
title = "A Corpus of {H}indi-{E}nglish Code-Mixed Posts to Hate Speech Detection",
author = "Kapil, Prashant and
Ekbal, Asif",
editor = "Lalitha Devi, Sobha and
Arora, Karunesh",
booktitle = "Proceedings of the 21st International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2024",
address = "AU-KBC Research Centre, Chennai, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2024.icon-1.9/",
pages = "79--85",
abstract = "Social media content, such as blog posts, comments, and tweets, often contains offensive language, including racial hate speech, personal attacks, and sexual harassment. Detecting inappropriate language is crucial for user safety and prevention of hateful behavior and aggression. This study introduces the HECM (Hindi-English code-mixed tweets) to fill the gap in Hindi language resources. The corpus comprises approximately 9.4K tweets labeled as hateful and nonhateful. It includes detailed information on the data, such as the annotation schema, the label definitions, and an interannotator agreement score of 85{\%}. The study evaluates the effectiveness of traditional machine learning, deep neural networks, and transformer encoder-based approaches. The results show a significant improvement in terms of macro-F1 and weighted F1 scores. Additionally, a lexicon containing 2000 lexicons tagged in 21 categories is created based on the multilingual HURTLEX lexicon. This lexicon is merged with the transformer encoder, resulting in a marginal improvement in macro-F1 and weighted-F1. The study also experiments with a Hindi-Devanagari dataset to assess the impact of the lexicon on performance metrics."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kapil-ekbal-2024-corpus">
<titleInfo>
<title>A Corpus of Hindi-English Code-Mixed Posts to Hate Speech Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prashant</namePart>
<namePart type="family">Kapil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Social media content, such as blog posts, comments, and tweets, often contains offensive language, including racial hate speech, personal attacks, and sexual harassment. Detecting inappropriate language is crucial for user safety and prevention of hateful behavior and aggression. This study introduces the HECM (Hindi-English code-mixed tweets) to fill the gap in Hindi language resources. The corpus comprises approximately 9.4K tweets labeled as hateful and nonhateful. It includes detailed information on the data, such as the annotation schema, the label definitions, and an interannotator agreement score of 85%. The study evaluates the effectiveness of traditional machine learning, deep neural networks, and transformer encoder-based approaches. The results show a significant improvement in terms of macro-F1 and weighted F1 scores. Additionally, a lexicon containing 2000 lexicons tagged in 21 categories is created based on the multilingual HURTLEX lexicon. This lexicon is merged with the transformer encoder, resulting in a marginal improvement in macro-F1 and weighted-F1. The study also experiments with a Hindi-Devanagari dataset to assess the impact of the lexicon on performance metrics.</abstract>
<identifier type="citekey">kapil-ekbal-2024-corpus</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.9/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>79</start>
<end>85</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Corpus of Hindi-English Code-Mixed Posts to Hate Speech Detection
%A Kapil, Prashant
%A Ekbal, Asif
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F kapil-ekbal-2024-corpus
%X Social media content, such as blog posts, comments, and tweets, often contains offensive language, including racial hate speech, personal attacks, and sexual harassment. Detecting inappropriate language is crucial for user safety and prevention of hateful behavior and aggression. This study introduces the HECM (Hindi-English code-mixed tweets) to fill the gap in Hindi language resources. The corpus comprises approximately 9.4K tweets labeled as hateful and nonhateful. It includes detailed information on the data, such as the annotation schema, the label definitions, and an interannotator agreement score of 85%. The study evaluates the effectiveness of traditional machine learning, deep neural networks, and transformer encoder-based approaches. The results show a significant improvement in terms of macro-F1 and weighted F1 scores. Additionally, a lexicon containing 2000 lexicons tagged in 21 categories is created based on the multilingual HURTLEX lexicon. This lexicon is merged with the transformer encoder, resulting in a marginal improvement in macro-F1 and weighted-F1. The study also experiments with a Hindi-Devanagari dataset to assess the impact of the lexicon on performance metrics.
%U https://aclanthology.org/2024.icon-1.9/
%P 79-85
Markdown (Informal)
[A Corpus of Hindi-English Code-Mixed Posts to Hate Speech Detection](https://aclanthology.org/2024.icon-1.9/) (Kapil & Ekbal, ICON 2024)
ACL