% BibTeX record for ACL Anthology paper 2022.icon-main.29.
% Case-sensitive title words are brace-protected as whole words so that
% sentence-casing styles keep the acronym capitals without mid-word braces.
@inproceedings{chatterjee-etal-2022-pacman,
  title     = {{PACMAN}:{PArallel} {CodeMixed} {dAta} {generatioN} for {POS} tagging},
  author    = {Chatterjee, Arindam and
               Sharma, Chhavi and
               Raj, Ayush and
               Ekbal, Asif},
  editor    = {Akhtar, Md. Shad and
               Chakraborty, Tanmoy},
  booktitle = {Proceedings of the 19th International Conference on Natural Language Processing (ICON)},
  month     = dec,
  year      = {2022},
  address   = {New Delhi, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.icon-main.29},
  pages     = {234--244},
  abstract  = {Code-mixing or Code-switching is the mixing of languages in the same context, predominantly observed in multilingual societies. The existing code-mixed datasets are small and primarily contain social media text that does not adhere to standard spelling and grammar. Computational models built on such data fail to generalise on unseen code-mixed data. To address the unavailability of quality code-mixed annotated datasets, we explore the combined task of generating annotated code mixed data, and building computational models from this generated data, specifically for code-mixed Part-Of-Speech (POS) tagging. We introduce PACMAN(PArallel CodeMixed dAta generatioN) - a synthetically generated code-mixed POS tagged dataset, with above 50K samples, which is the largest annotated code-mixed dataset. We build POS taggers using classical machine learning and deep learning based techniques on the generated data to report an F1-score of 98{\%} (8{\%} above current State-of-the-art (SOTA)). To determine the efficacy of our data, we compare it against the existing benchmark in code-mixed POS tagging. PACMAN outperforms the benchmark, ratifying that our dataset and, subsequently, our POS tagging models are generalised and capable of handling even natural code-mixed and monolingual data.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey chatterjee-etal-2022-pacman. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="chatterjee-etal-2022-pacman">
    <titleInfo>
      <title>PACMAN:PArallel CodeMixed dAta generatioN for POS tagging</title>
    </titleInfo>
    <!-- Authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Arindam</namePart>
      <namePart type="family">Chatterjee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chhavi</namePart>
      <namePart type="family">Sharma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ayush</namePart>
      <namePart type="family">Raj</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Asif</namePart>
      <namePart type="family">Ekbal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 19th International Conference on Natural Language Processing (ICON)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Md.</namePart>
        <namePart type="given">Shad</namePart>
        <namePart type="family">Akhtar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">New Delhi, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Code-mixing or Code-switching is the mixing of languages in the same context, predominantly observed in multilingual societies. The existing code-mixed datasets are small and primarily contain social media text that does not adhere to standard spelling and grammar. Computational models built on such data fail to generalise on unseen code-mixed data. To address the unavailability of quality code-mixed annotated datasets, we explore the combined task of generating annotated code mixed data, and building computational models from this generated data, specifically for code-mixed Part-Of-Speech (POS) tagging. We introduce PACMAN(PArallel CodeMixed dAta generatioN) - a synthetically generated code-mixed POS tagged dataset, with above 50K samples, which is the largest annotated code-mixed dataset. We build POS taggers using classical machine learning and deep learning based techniques on the generated data to report an F1-score of 98% (8% above current State-of-the-art (SOTA)). To determine the efficacy of our data, we compare it against the existing benchmark in code-mixed POS tagging. PACMAN outperforms the benchmark, ratifying that our dataset and, subsequently, our POS tagging models are generalised and capable of handling even natural code-mixed and monolingual data.</abstract>
    <identifier type="citekey">chatterjee-etal-2022-pacman</identifier>
    <location>
      <url>https://aclanthology.org/2022.icon-main.29</url>
    </location>
    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>234</start>
        <end>244</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T PACMAN:PArallel CodeMixed dAta generatioN for POS tagging
%A Chatterjee, Arindam
%A Sharma, Chhavi
%A Raj, Ayush
%A Ekbal, Asif
%Y Akhtar, Md. Shad
%Y Chakraborty, Tanmoy
%S Proceedings of the 19th International Conference on Natural Language Processing (ICON)
%D 2022
%8 December
%I Association for Computational Linguistics
%C New Delhi, India
%F chatterjee-etal-2022-pacman
%X Code-mixing or Code-switching is the mixing of languages in the same context, predominantly observed in multilingual societies. The existing code-mixed datasets are small and primarily contain social media text that does not adhere to standard spelling and grammar. Computational models built on such data fail to generalise on unseen code-mixed data. To address the unavailability of quality code-mixed annotated datasets, we explore the combined task of generating annotated code mixed data, and building computational models from this generated data, specifically for code-mixed Part-Of-Speech (POS) tagging. We introduce PACMAN(PArallel CodeMixed dAta generatioN) - a synthetically generated code-mixed POS tagged dataset, with above 50K samples, which is the largest annotated code-mixed dataset. We build POS taggers using classical machine learning and deep learning based techniques on the generated data to report an F1-score of 98% (8% above current State-of-the-art (SOTA)). To determine the efficacy of our data, we compare it against the existing benchmark in code-mixed POS tagging. PACMAN outperforms the benchmark, ratifying that our dataset and, subsequently, our POS tagging models are generalised and capable of handling even natural code-mixed and monolingual data.
%U https://aclanthology.org/2022.icon-main.29
%P 234-244
Markdown (Informal)
[PACMAN:PArallel CodeMixed dAta generatioN for POS tagging](https://aclanthology.org/2022.icon-main.29) (Chatterjee et al., ICON 2022)
ACL
- Arindam Chatterjee, Chhavi Sharma, Ayush Raj, and Asif Ekbal. 2022. PACMAN:PArallel CodeMixed dAta generatioN for POS tagging. In Proceedings of the 19th International Conference on Natural Language Processing (ICON), pages 234–244, New Delhi, India. Association for Computational Linguistics.