% Conference paper record; "Proksh" is a single-part author name (no given/family split).
@inproceedings{kumar-etal-2024-mocktails,
  title     = {Mocktails of Translation, Ensemble Learning and Embeddings to tackle {Hinglish} {NLP} challenges},
  author    = {Kumar, Lov and
               Singh, Vikram and
               Proksh and
               Mishra, Pratyush},
  editor    = {Lalitha Devi, Sobha and
               Arora, Karunesh},
  booktitle = {Proceedings of the 21st International Conference on Natural Language Processing ({ICON})},
  month     = dec,
  year      = {2024},
  address   = {AU-KBC Research Centre, Chennai, India},
  publisher = {NLP Association of India (NLPAI)},
  url       = {https://aclanthology.org/2024.icon-1.70/},
  pages     = {593--601},
  abstract  = {Social media has become a global platform where users express opinions on diverse contemporary topics, often blending dominant languages with native tongues, leading to code-mixed, context-rich content. A typical example is Hinglish, where Hindi elements are embedded in English texts. This linguistic mixture challenges traditional NLP systems, which rely on monolingual resources and need help to process multilingual content. Sentiment analysis for code-mixed data, mainly involving Indian languages, remains largely unexplored. This paper introduces a novel approach for sentiment analysis of code-mixed Hinglish data, combining translation, different stacking classifier architectures, and embedding techniques. We utilize pre-trained LoRA weights of a fine-tuned Gemma-2B model to translate Hinglish into English, followed by the employment of four pre-trained meta embeddings: GloVe-T, Word2Vec, TF-IDF, and fastText. SMOTE is applied to balance skewed data, and dimensionality reduction is performed before implementing machine learning models and stacking classifier ensembles. Three ensemble architectures, combining 22 base classifiers with a Logistic Regression meta-classifier, test different meta-embedding combinations. Experimental results show that the TF-W2V-FST (TF-IDF, Word2Vec, fastText) combination performs best, with SVM radial bias achieving the highest accuracy 91.53{\%} and AUC (0.96). This research contributes a novel and effective technique to sentiment analysis for code-mixed data.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the conference paper with citekey kumar-etal-2024-mocktails. -->
  <mods ID="kumar-etal-2024-mocktails">
    <titleInfo>
      <title>Mocktails of Translation, Ensemble Learning and Embeddings to tackle Hinglish NLP challenges</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Lov</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vikram</namePart>
      <namePart type="family">Singh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Single-part name: kept as one untyped namePart, but marked personal like the other authors. -->
    <name type="personal">
      <namePart>Proksh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pratyush</namePart>
      <namePart type="family">Mishra</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sobha</namePart>
        <namePart type="family">Lalitha Devi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Karunesh</namePart>
        <namePart type="family">Arora</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>NLP Association of India (NLPAI)</publisher>
        <place>
          <placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Social media has become a global platform where users express opinions on diverse contemporary topics, often blending dominant languages with native tongues, leading to code-mixed, context-rich content. A typical example is Hinglish, where Hindi elements are embedded in English texts. This linguistic mixture challenges traditional NLP systems, which rely on monolingual resources and need help to process multilingual content. Sentiment analysis for code-mixed data, mainly involving Indian languages, remains largely unexplored. This paper introduces a novel approach for sentiment analysis of code-mixed Hinglish data, combining translation, different stacking classifier architectures, and embedding techniques. We utilize pre-trained LoRA weights of a fine-tuned Gemma-2B model to translate Hinglish into English, followed by the employment of four pre-trained meta embeddings: GloVe-T, Word2Vec, TF-IDF, and fastText. SMOTE is applied to balance skewed data, and dimensionality reduction is performed before implementing machine learning models and stacking classifier ensembles. Three ensemble architectures, combining 22 base classifiers with a Logistic Regression meta-classifier, test different meta-embedding combinations. Experimental results show that the TF-W2V-FST (TF-IDF, Word2Vec, fastText) combination performs best, with SVM radial bias achieving the highest accuracy 91.53% and AUC (0.96). This research contributes a novel and effective technique to sentiment analysis for code-mixed data.</abstract>
    <identifier type="citekey">kumar-etal-2024-mocktails</identifier>
    <location>
      <url>https://aclanthology.org/2024.icon-1.70/</url>
    </location>
    <part>
      <date>2024-12</date>
      <extent unit="page">
        <start>593</start>
        <end>601</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Mocktails of Translation, Ensemble Learning and Embeddings to tackle Hinglish NLP challenges
%A Kumar, Lov
%A Singh, Vikram
%A Proksh
%A Mishra, Pratyush
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F kumar-etal-2024-mocktails
%X Social media has become a global platform where users express opinions on diverse contemporary topics, often blending dominant languages with native tongues, leading to code-mixed, context-rich content. A typical example is Hinglish, where Hindi elements are embedded in English texts. This linguistic mixture challenges traditional NLP systems, which rely on monolingual resources and need help to process multilingual content. Sentiment analysis for code-mixed data, mainly involving Indian languages, remains largely unexplored. This paper introduces a novel approach for sentiment analysis of code-mixed Hinglish data, combining translation, different stacking classifier architectures, and embedding techniques. We utilize pre-trained LoRA weights of a fine-tuned Gemma-2B model to translate Hinglish into English, followed by the employment of four pre-trained meta embeddings: GloVe-T, Word2Vec, TF-IDF, and fastText. SMOTE is applied to balance skewed data, and dimensionality reduction is performed before implementing machine learning models and stacking classifier ensembles. Three ensemble architectures, combining 22 base classifiers with a Logistic Regression meta-classifier, test different meta-embedding combinations. Experimental results show that the TF-W2V-FST (TF-IDF, Word2Vec, fastText) combination performs best, with SVM radial bias achieving the highest accuracy 91.53% and AUC (0.96). This research contributes a novel and effective technique to sentiment analysis for code-mixed data.
%U https://aclanthology.org/2024.icon-1.70/
%P 593-601
Markdown (Informal)
[Mocktails of Translation, Ensemble Learning and Embeddings to tackle Hinglish NLP challenges](https://aclanthology.org/2024.icon-1.70/) (Kumar et al., ICON 2024)
ACL