@inproceedings{e-ojo-etal-2022-language,
title = "Language Identification at the Word Level in Code-Mixed Texts Using Character Sequence and Word Embedding",
author = "E. Ojo, O. and
Gelbukh, A. and
Calvo, H. and
Feldman, A. and
O. Adebanji, O. and
Armenta-Segura, J.",
editor = "Chakravarthi, Bharathi Raja and
Murugappan, Abirami and
Chinnappa, Dhivya and
Hande, Adeep and
Kumaresan, Prasanna Kumar and
Ponnusamy, Rahul",
booktitle = "Proceedings of the 19th International Conference on Natural Language Processing (ICON): Shared Task on Word Level Language Identification in Code-mixed Kannada-English Texts",
month = dec,
year = "2022",
address = "IIIT Delhi, New Delhi, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.icon-wlli.1",
pages = "1--6",
abstract = "People often switch languages in conversations or written communication in order to communicate thoughts on social media platforms. The languages in texts of this type, also known as code-mixed texts, can be mixed at the sentence, word, or even sub-word level. In this paper, we address the problem of identifying language at the word level in code-mixed texts using a sequence of characters and word embedding. We feed machine learning and deep neural networks with a range of character-based and word-based text features as input. The data for this experiment was created by combining YouTube video comments from code-mixed Kannada and English (Kn-En) texts. The texts were pre-processed, split into words, and categorized as {`}Kannada{'}, {`}English{'}, {`}Mixed-Language{'}, {`}Name{'}, {`}Location{'}, and {`}Other{'}. The proposed techniques were able to learn from these features and were able to effectively identify the language of the words in the dataset. The proposed CK-Keras model with pre-trained Word2Vec embedding was our best-performing system, as it outperformed other methods when evaluated by the F1 scores.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="e-ojo-etal-2022-language">
<titleInfo>
<title>Language Identification at the Word Level in Code-Mixed Texts Using Character Sequence and Word Embedding</title>
</titleInfo>
<name type="personal">
<namePart type="given">O</namePart>
<namePart type="family">E. Ojo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A</namePart>
<namePart type="family">Gelbukh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">H</namePart>
<namePart type="family">Calvo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A</namePart>
<namePart type="family">Feldman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">O</namePart>
<namePart type="family">O. Adebanji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">J</namePart>
<namePart type="family">Armenta-Segura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Conference on Natural Language Processing (ICON): Shared Task on Word Level Language Identification in Code-mixed Kannada-English Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abirami</namePart>
<namePart type="family">Murugappan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhivya</namePart>
<namePart type="family">Chinnappa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adeep</namePart>
<namePart type="family">Hande</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prasanna</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Kumaresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Ponnusamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">IIIT Delhi, New Delhi, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>People often switch languages in conversations or written communication in order to communicate thoughts on social media platforms. The languages in texts of this type, also known as code-mixed texts, can be mixed at the sentence, word, or even sub-word level. In this paper, we address the problem of identifying language at the word level in code-mixed texts using a sequence of characters and word embedding. We feed machine learning and deep neural networks with a range of character-based and word-based text features as input. The data for this experiment was created by combining YouTube video comments from code-mixed Kannada and English (Kn-En) texts. The texts were pre-processed, split into words, and categorized as ‘Kannada’, ‘English’, ‘Mixed-Language’, ‘Name’, ‘Location’, and ‘Other’. The proposed techniques were able to learn from these features and were able to effectively identify the language of the words in the dataset. The proposed CK-Keras model with pre-trained Word2Vec embedding was our best-performing system, as it outperformed other methods when evaluated by the F1 scores.</abstract>
<identifier type="citekey">e-ojo-etal-2022-language</identifier>
<location>
<url>https://aclanthology.org/2022.icon-wlli.1</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>1</start>
<end>6</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Identification at the Word Level in Code-Mixed Texts Using Character Sequence and Word Embedding
%A E. Ojo, O.
%A Gelbukh, A.
%A Calvo, H.
%A Feldman, A.
%A O. Adebanji, O.
%A Armenta-Segura, J.
%Y Chakravarthi, Bharathi Raja
%Y Murugappan, Abirami
%Y Chinnappa, Dhivya
%Y Hande, Adeep
%Y Kumaresan, Prasanna Kumar
%Y Ponnusamy, Rahul
%S Proceedings of the 19th International Conference on Natural Language Processing (ICON): Shared Task on Word Level Language Identification in Code-mixed Kannada-English Texts
%D 2022
%8 December
%I Association for Computational Linguistics
%C IIIT Delhi, New Delhi, India
%F e-ojo-etal-2022-language
%X People often switch languages in conversations or written communication in order to communicate thoughts on social media platforms. The languages in texts of this type, also known as code-mixed texts, can be mixed at the sentence, word, or even sub-word level. In this paper, we address the problem of identifying language at the word level in code-mixed texts using a sequence of characters and word embedding. We feed machine learning and deep neural networks with a range of character-based and word-based text features as input. The data for this experiment was created by combining YouTube video comments from code-mixed Kannada and English (Kn-En) texts. The texts were pre-processed, split into words, and categorized as ‘Kannada’, ‘English’, ‘Mixed-Language’, ‘Name’, ‘Location’, and ‘Other’. The proposed techniques were able to learn from these features and were able to effectively identify the language of the words in the dataset. The proposed CK-Keras model with pre-trained Word2Vec embedding was our best-performing system, as it outperformed other methods when evaluated by the F1 scores.
%U https://aclanthology.org/2022.icon-wlli.1
%P 1-6
Markdown (Informal)
[Language Identification at the Word Level in Code-Mixed Texts Using Character Sequence and Word Embedding](https://aclanthology.org/2022.icon-wlli.1) (E. Ojo et al., ICON 2022)
ACL