@inproceedings{bansal-etal-2020-language,
title = "Language Identification and Normalization of Code Mixed {E}nglish and {P}unjabi Text",
author = "Bansal, Neetika and
Goyal, Dr. Vishal and
Rani, Dr. Simpel",
editor = "Goyal, Vishal and
Ekbal, Asif",
booktitle = "Proceedings of the 17th International Conference on Natural Language Processing (ICON): System Demonstrations",
month = dec,
year = "2020",
address = "Patna, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2020.icon-demos.12",
pages = "30--31",
abstract = "Code mixing is prevalent when users use two or more languages while communicating. It becomes more complex when users prefer romanized text to Unicode typing. The automatic processing of social media data has become one of popular areas of interest. Especially since COVID period the involvement of youngsters has attained heights. Walking with the pace our intended software deals with Language Identification and Normalization of English and Punjabi code mixed text. The software designed follows a pipeline which includes data collection, pre-processing, language identification, handling Out of Vocabulary words, normalization and transliteration of English- Punjabi text. After applying five-fold cross validation on the corpus, the accuracy of 96.8{\%} is achieved on a trained dataset of around 80025 tokens. After the prediction of the tags: the slangs, contractions in the user input are normalized to their standard form. In addition, the words with Punjabi as predicted tags are transliterated to Punjabi.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bansal-etal-2020-language">
<titleInfo>
<title>Language Identification and Normalization of Code Mixed English and Punjabi Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Neetika</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr.</namePart>
<namePart type="given">Vishal</namePart>
<namePart type="family">Goyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dr.</namePart>
<namePart type="given">Simpel</namePart>
<namePart type="family">Rani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Natural Language Processing (ICON): System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vishal</namePart>
<namePart type="family">Goyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Patna, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Code mixing is prevalent when users use two or more languages while communicating. It becomes more complex when users prefer romanized text to Unicode typing. The automatic processing of social media data has become one of popular areas of interest. Especially since COVID period the involvement of youngsters has attained heights. Walking with the pace our intended software deals with Language Identification and Normalization of English and Punjabi code mixed text. The software designed follows a pipeline which includes data collection, pre-processing, language identification, handling Out of Vocabulary words, normalization and transliteration of English- Punjabi text. After applying five-fold cross validation on the corpus, the accuracy of 96.8% is achieved on a trained dataset of around 80025 tokens. After the prediction of the tags: the slangs, contractions in the user input are normalized to their standard form. In addition, the words with Punjabi as predicted tags are transliterated to Punjabi.</abstract>
<identifier type="citekey">bansal-etal-2020-language</identifier>
<location>
<url>https://aclanthology.org/2020.icon-demos.12</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>30</start>
<end>31</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Identification and Normalization of Code Mixed English and Punjabi Text
%A Bansal, Neetika
%A Goyal, Dr. Vishal
%A Rani, Dr. Simpel
%Y Goyal, Vishal
%Y Ekbal, Asif
%S Proceedings of the 17th International Conference on Natural Language Processing (ICON): System Demonstrations
%D 2020
%8 December
%I NLP Association of India (NLPAI)
%C Patna, India
%F bansal-etal-2020-language
%X Code mixing is prevalent when users use two or more languages while communicating. It becomes more complex when users prefer romanized text to Unicode typing. The automatic processing of social media data has become one of popular areas of interest. Especially since COVID period the involvement of youngsters has attained heights. Walking with the pace our intended software deals with Language Identification and Normalization of English and Punjabi code mixed text. The software designed follows a pipeline which includes data collection, pre-processing, language identification, handling Out of Vocabulary words, normalization and transliteration of English- Punjabi text. After applying five-fold cross validation on the corpus, the accuracy of 96.8% is achieved on a trained dataset of around 80025 tokens. After the prediction of the tags: the slangs, contractions in the user input are normalized to their standard form. In addition, the words with Punjabi as predicted tags are transliterated to Punjabi.
%U https://aclanthology.org/2020.icon-demos.12
%P 30-31
Markdown (Informal)
[Language Identification and Normalization of Code Mixed English and Punjabi Text](https://aclanthology.org/2020.icon-demos.12) (Bansal et al., ICON 2020)
ACL