@inproceedings{mohammadi-etal-2025-boosting,
title = "Boosting Sentiment Analysis in {P}ersian through a {GAN}-Based Synthetic Data Augmentation Method",
author = "Mohammadi, Masoumeh and
Amin, Mohammad Ruhul and
Tavakoli, Shadi",
editor = "El-Haj, Mo",
booktitle = "Proceedings of the 1st Workshop on NLP for Languages Using Arabic Script",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.abjadnlp-1.7/",
pages = "54--63",
abstract = "This paper presents a novel Sentiment Analysis (SA) dataset in the low-resource Persian language including a data augmentation technique using Generative Adversarial Networks (GANs) to generate synthetic data, boosting the volume and variety of data, for achieving state-of-the-art performance. We propose a novel annotated SA dataset, called Senti-Persian, made of 67,743 public comments on movie reviews from Iranian websites (Namava, Filimo and Aparat) and social media (YouTube, Twitter and Instagram). These reviews are labeled with one of the polarity labels, namely positive, negative, and neutral. Our study includes a novel text augmentation model based on GANs. The generator was designed following the linguistic properties of Persian, while the discriminator was designed based on the cosine similarity of the vectorized original and generated sentences, i.e. using CLS-embeddings of BERT. An SA task was applied to both the collected and augmented datasets, for which we observed a significant improvement in accuracy from 88.4{\%} for the original dataset to 96{\%} when augmented with synthetic data. The Senti-Persian dataset, including the original and the augmented data, will be available on GitHub."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mohammadi-etal-2025-boosting">
<titleInfo>
<title>Boosting Sentiment Analysis in Persian through a GAN-Based Synthetic Data Augmentation Method</title>
</titleInfo>
<name type="personal">
<namePart type="given">Masoumeh</namePart>
<namePart type="family">Mohammadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Ruhul</namePart>
<namePart type="family">Amin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shadi</namePart>
<namePart type="family">Tavakoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mo</namePart>
<namePart type="family">El-Haj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents a novel Sentiment Analysis (SA) dataset in the low-resource Persian language including a data augmentation technique using Generative Adversarial Networks (GANs) to generate synthetic data, boosting the volume and variety of data, for achieving state-of-the-art performance. We propose a novel annotated SA dataset, called Senti-Persian, made of 67,743 public comments on movie reviews from Iranian websites (Namava, Filimo and Aparat) and social media (YouTube, Twitter and Instagram). These reviews are labeled with one of the polarity labels, namely positive, negative, and neutral. Our study includes a novel text augmentation model based on GANs. The generator was designed following the linguistic properties of Persian, while the discriminator was designed based on the cosine similarity of the vectorized original and generated sentences, i.e. using CLS-embeddings of BERT. An SA task was applied to both the collected and augmented datasets, for which we observed a significant improvement in accuracy from 88.4% for the original dataset to 96% when augmented with synthetic data. The Senti-Persian dataset, including the original and the augmented data, will be available on GitHub.</abstract>
<identifier type="citekey">mohammadi-etal-2025-boosting</identifier>
<location>
<url>https://aclanthology.org/2025.abjadnlp-1.7/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>54</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Boosting Sentiment Analysis in Persian through a GAN-Based Synthetic Data Augmentation Method
%A Mohammadi, Masoumeh
%A Amin, Mohammad Ruhul
%A Tavakoli, Shadi
%Y El-Haj, Mo
%S Proceedings of the 1st Workshop on NLP for Languages Using Arabic Script
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F mohammadi-etal-2025-boosting
%X This paper presents a novel Sentiment Analysis (SA) dataset in the low-resource Persian language including a data augmentation technique using Generative Adversarial Networks (GANs) to generate synthetic data, boosting the volume and variety of data, for achieving state-of-the-art performance. We propose a novel annotated SA dataset, called Senti-Persian, made of 67,743 public comments on movie reviews from Iranian websites (Namava, Filimo and Aparat) and social media (YouTube, Twitter and Instagram). These reviews are labeled with one of the polarity labels, namely positive, negative, and neutral. Our study includes a novel text augmentation model based on GANs. The generator was designed following the linguistic properties of Persian, while the discriminator was designed based on the cosine similarity of the vectorized original and generated sentences, i.e. using CLS-embeddings of BERT. An SA task was applied to both the collected and augmented datasets, for which we observed a significant improvement in accuracy from 88.4% for the original dataset to 96% when augmented with synthetic data. The Senti-Persian dataset, including the original and the augmented data, will be available on GitHub.
%U https://aclanthology.org/2025.abjadnlp-1.7/
%P 54-63
Markdown (Informal)
[Boosting Sentiment Analysis in Persian through a GAN-Based Synthetic Data Augmentation Method](https://aclanthology.org/2025.abjadnlp-1.7/) (Mohammadi et al., AbjadNLP 2025)
ACL