@inproceedings{zeng-2024-leveraging,
title = "Leveraging Large Language Models for Code-Mixed Data Augmentation in Sentiment Analysis",
author = "Zeng, Linda",
editor = "Hale, James and
Chawla, Kushal and
Garg, Muskan",
booktitle = "Proceedings of the Second Workshop on Social Influence in Conversations (SICon 2024)",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.sicon-1.6",
pages = "85--101",
abstract = "Code-mixing (CM), where speakers blend languages within a single expression, is prevalent in multilingual societies but poses challenges for natural language processing due to its complexity and limited data. We propose using a large language model to generate synthetic CM data, which is then used to enhance the performance of task-specific models for CM sentiment analysis. Our results show that in Spanish-English, synthetic data improved the F1 score by 9.32{\%}, outperforming previous augmentation techniques. However, in Malayalam-English, synthetic data only helped when the baseline was low; with strong natural data, additional synthetic data offered little benefit. Human evaluation confirmed that this approach is a simple, cost-effective way to generate natural-sounding CM sentences, particularly beneficial for low baselines. Our findings suggest that few-shot prompting of large language models is a promising method for CM data augmentation and has significant impact on improving sentiment analysis, an important element in the development of social influence systems.",
}
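The abstract outlines the augmentation recipe: few-shot prompt a large language model with natural code-mixed exemplars, collect synthetic labeled sentences, and fold them into the training data of a task-specific sentiment classifier. Below is a minimal sketch of that prompting pattern, assuming an OpenAI-style chat API; the prompt wording, the exemplar sentences, and the model name are illustrative placeholders, not the paper's actual setup.

```python
# Illustrative sketch only: the paper's real prompts, exemplars, and model are
# not reproduced here. Assumes the openai>=1.0 client and OPENAI_API_KEY set.
from openai import OpenAI

client = OpenAI()

# Hypothetical Spanish-English code-mixed exemplars with sentiment labels;
# in the paper these would come from natural CM data, not be invented.
FEW_SHOT = [
    ("I love this cancion, it's so pegajosa!", "positive"),
    ("El servicio was terrible, never coming back.", "negative"),
]

def generate_cm_sentence(sentiment: str) -> str:
    """Ask the LLM for one synthetic code-mixed sentence with the given label."""
    shots = "\n".join(f"[{label}] {text}" for text, label in FEW_SHOT)
    prompt = (
        "Write one natural-sounding Spanish-English code-mixed sentence "
        f"expressing {sentiment} sentiment, in the style of these examples:\n"
        f"{shots}\n[{sentiment}]"
    )
    resp = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content.strip()

if __name__ == "__main__":
    # Synthetic (sentence, label) pairs like these would then augment the
    # training set of a task-specific CM sentiment model, per the abstract.
    for label in ("positive", "negative"):
        print(label, "->", generate_cm_sentence(label))
```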