@inproceedings{vajrobol-2022-coli,
title = "{C}o{LI}-Kanglish: Word-Level Language Identification in Code-Mixed {K}annada-{E}nglish Texts Shared Task using the Distilka model",
author = "Vajrobol, Vajratiya",
editor = "Chakravarthi, Bharathi Raja and
Murugappan, Abirami and
Chinnappa, Dhivya and
Hane, Adeep and
Kumeresan, Prasanna Kumar and
Ponnusamy, Rahul",
booktitle = "Proceedings of the 19th International Conference on Natural Language Processing (ICON): Shared Task on Word Level Language Identification in Code-mixed Kannada-English Texts",
month = dec,
year = "2022",
address = "IIIT Delhi, New Delhi, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.icon-wlli.2",
pages = "7--11",
abstract = "Due to the intercultural demographic of online users, code-mixed language is often used by them to express themselves on social media. Language support to such users is based on the ability of a system to identify the constituent languages of the code-mixed language. Therefore, the process of language identification that helps in determining the language of individual textual entities from a code-mixed corpus is a current and relevant classification problem. Code-mixed texts are difficult to interpret and analyze from an algorithmic perspective. However, highly complex transformer-based techniques can be used to analyze and identify distinct languages of words in code-mixed texts. Kannada is one of the Dravidian languages which is spoken and written in Karnataka, India. This study aims to identify the language of individual words of texts from a corpus of code-mixed Kannada-English texts using transformer-based techniques. The proposed Distilka model was developed by fine-tuning the DistilBERT model using the code-mixed corpus. This model performed best on the official test dataset with a macro-averaged F1-score of 0.62 and weighted precision score of 0.86. The proposed solution ranked first in the shared task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vajrobol-2022-coli">
<titleInfo>
<title>CoLI-Kanglish: Word-Level Language Identification in Code-Mixed Kannada-English Texts Shared Task using the Distilka model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vajratiya</namePart>
<namePart type="family">Vajrobol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Conference on Natural Language Processing (ICON): Shared Task on Word Level Language Identification in Code-mixed Kannada-English Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abirami</namePart>
<namePart type="family">Murugappan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhivya</namePart>
<namePart type="family">Chinnappa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adeep</namePart>
<namePart type="family">Hane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasanna</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Kumeresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Ponnusamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">IIIT Delhi, New Delhi, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Due to the intercultural demographic of online users, code-mixed language is often used by them to express themselves on social media. Language support to such users is based on the ability of a system to identify the constituent languages of the code-mixed language. Therefore, the process of language identification that helps in determining the language of individual textual entities from a code-mixed corpus is a current and relevant classification problem. Code-mixed texts are difficult to interpret and analyze from an algorithmic perspective. However, highly complex transformer-based techniques can be used to analyze and identify distinct languages of words in code-mixed texts. Kannada is one of the Dravidian languages which is spoken and written in Karnataka, India. This study aims to identify the language of individual words of texts from a corpus of code-mixed Kannada-English texts using transformer-based techniques. The proposed Distilka model was developed by fine-tuning the DistilBERT model using the code-mixed corpus. This model performed best on the official test dataset with a macro-averaged F1-score of 0.62 and weighted precision score of 0.86. The proposed solution ranked first in the shared task.</abstract>
<identifier type="citekey">vajrobol-2022-coli</identifier>
<location>
<url>https://aclanthology.org/2022.icon-wlli.2</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>7</start>
<end>11</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CoLI-Kanglish: Word-Level Language Identification in Code-Mixed Kannada-English Texts Shared Task using the Distilka model
%A Vajrobol, Vajratiya
%Y Chakravarthi, Bharathi Raja
%Y Murugappan, Abirami
%Y Chinnappa, Dhivya
%Y Hane, Adeep
%Y Kumeresan, Prasanna Kumar
%Y Ponnusamy, Rahul
%S Proceedings of the 19th International Conference on Natural Language Processing (ICON): Shared Task on Word Level Language Identification in Code-mixed Kannada-English Texts
%D 2022
%8 December
%I Association for Computational Linguistics
%C IIIT Delhi, New Delhi, India
%F vajrobol-2022-coli
%X Due to the intercultural demographic of online users, code-mixed language is often used by them to express themselves on social media. Language support to such users is based on the ability of a system to identify the constituent languages of the code-mixed language. Therefore, the process of language identification that helps in determining the language of individual textual entities from a code-mixed corpus is a current and relevant classification problem. Code-mixed texts are difficult to interpret and analyze from an algorithmic perspective. However, highly complex transformer-based techniques can be used to analyze and identify distinct languages of words in code-mixed texts. Kannada is one of the Dravidian languages which is spoken and written in Karnataka, India. This study aims to identify the language of individual words of texts from a corpus of code-mixed Kannada-English texts using transformer-based techniques. The proposed Distilka model was developed by fine-tuning the DistilBERT model using the code-mixed corpus. This model performed best on the official test dataset with a macro-averaged F1-score of 0.62 and weighted precision score of 0.86. The proposed solution ranked first in the shared task.
%U https://aclanthology.org/2022.icon-wlli.2
%P 7-11
Markdown (Informal)
[CoLI-Kanglish: Word-Level Language Identification in Code-Mixed Kannada-English Texts Shared Task using the Distilka model](https://aclanthology.org/2022.icon-wlli.2) (Vajrobol, ICON 2022)
ACL