@inproceedings{kodali-etal-2022-symcom,
title = "{S}y{MC}o{M} - Syntactic Measure of Code Mixing A Study Of {E}nglish-{H}indi Code-Mixing",
author = "Kodali, Prashant and
Goel, Anmol and
Choudhury, Monojit and
Shrivastava, Manish and
Kumaraguru, Ponnurangam",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.40",
doi = "10.18653/v1/2022.findings-acl.40",
pages = "472--480",
abstract = "Code mixing is the linguistic phenomenon where bilingual speakers tend to switch between two or more languages in conversations. Recent work on code-mixing in computational settings has leveraged social media code mixed texts to train NLP models. For capturing the variety of code mixing in, and across corpus, Language ID (LID) tags based measures (CMI) have been proposed. Syntactical variety/patterns of code-mixing and their relationship vis-a-vis computational model{'}s performance is under explored. In this work, we investigate a collection of English(en)-Hindi(hi) code-mixed datasets from a syntactic lens to propose, $SyMCoM$, an indicator of syntactic variety in code-mixed text, with intuitive theoretical bounds. We train SoTA en-hi PoS tagger, accuracy of 93.4{\%}, to reliably compute PoS tags on a corpus, and demonstrate the utility of $SyMCoM$ by applying it on various syntactical categories on a collection of datasets, and compare datasets using the measure.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kodali-etal-2022-symcom">
<titleInfo>
<title>SyMCoM - Syntactic Measure of Code Mixing A Study Of English-Hindi Code-Mixing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prashant</namePart>
<namePart type="family">Kodali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anmol</namePart>
<namePart type="family">Goel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Monojit</namePart>
<namePart type="family">Choudhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manish</namePart>
<namePart type="family">Shrivastava</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ponnurangam</namePart>
<namePart type="family">Kumaraguru</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aline</namePart>
<namePart type="family">Villavicencio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Code mixing is the linguistic phenomenon where bilingual speakers tend to switch between two or more languages in conversations. Recent work on code-mixing in computational settings has leveraged social media code-mixed texts to train NLP models. To capture the variety of code mixing within and across corpora, measures based on Language ID (LID) tags, such as CMI, have been proposed. The syntactic variety and patterns of code-mixing, and their relationship to computational models’ performance, remain underexplored. In this work, we investigate a collection of English (en)-Hindi (hi) code-mixed datasets from a syntactic lens to propose SyMCoM, an indicator of syntactic variety in code-mixed text with intuitive theoretical bounds. We train a SoTA en-hi PoS tagger with an accuracy of 93.4% to reliably compute PoS tags on a corpus, demonstrate the utility of SyMCoM by applying it to various syntactic categories across a collection of datasets, and compare the datasets using the measure.</abstract>
<identifier type="citekey">kodali-etal-2022-symcom</identifier>
<identifier type="doi">10.18653/v1/2022.findings-acl.40</identifier>
<location>
<url>https://aclanthology.org/2022.findings-acl.40</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>472</start>
<end>480</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SyMCoM - Syntactic Measure of Code Mixing A Study Of English-Hindi Code-Mixing
%A Kodali, Prashant
%A Goel, Anmol
%A Choudhury, Monojit
%A Shrivastava, Manish
%A Kumaraguru, Ponnurangam
%Y Muresan, Smaranda
%Y Nakov, Preslav
%Y Villavicencio, Aline
%S Findings of the Association for Computational Linguistics: ACL 2022
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F kodali-etal-2022-symcom
%X Code mixing is the linguistic phenomenon where bilingual speakers tend to switch between two or more languages in conversations. Recent work on code-mixing in computational settings has leveraged social media code-mixed texts to train NLP models. To capture the variety of code mixing within and across corpora, measures based on Language ID (LID) tags, such as CMI, have been proposed. The syntactic variety and patterns of code-mixing, and their relationship to computational models’ performance, remain underexplored. In this work, we investigate a collection of English (en)-Hindi (hi) code-mixed datasets from a syntactic lens to propose SyMCoM, an indicator of syntactic variety in code-mixed text with intuitive theoretical bounds. We train a SoTA en-hi PoS tagger with an accuracy of 93.4% to reliably compute PoS tags on a corpus, demonstrate the utility of SyMCoM by applying it to various syntactic categories across a collection of datasets, and compare the datasets using the measure.
%R 10.18653/v1/2022.findings-acl.40
%U https://aclanthology.org/2022.findings-acl.40
%U https://doi.org/10.18653/v1/2022.findings-acl.40
%P 472-480
Markdown (Informal)
[SyMCoM - Syntactic Measure of Code Mixing A Study Of English-Hindi Code-Mixing](https://aclanthology.org/2022.findings-acl.40) (Kodali et al., Findings 2022)
ACL
Prashant Kodali, Anmol Goel, Monojit Choudhury, Manish Shrivastava, and Ponnurangam Kumaraguru. 2022. [SyMCoM - Syntactic Measure of Code Mixing A Study Of English-Hindi Code-Mixing](https://aclanthology.org/2022.findings-acl.40). In *Findings of the Association for Computational Linguistics: ACL 2022*, pages 472–480, Dublin, Ireland. Association for Computational Linguistics.
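
The abstract describes SyMCoM only at a high level: a PoS-category-wise indicator of syntactic variety in code-mixed text with intuitive theoretical bounds, computed from LID and PoS tags. The sketch below illustrates one plausible reading of such a measure; the function name, the (token, LID, PoS) input format, and the normalised-difference formula are assumptions made for illustration and may not match the paper's exact definition.

```python
# Hypothetical sketch of a SyMCoM-style, per-PoS-category score, based only on the
# abstract's description. The exact formula in Kodali et al. (2022) may differ.
from collections import Counter
from typing import Dict, Iterable, Tuple


def symcom_per_category(tokens: Iterable[Tuple[str, str, str]]) -> Dict[str, float]:
    """For each PoS category, return (count_en - count_hi) / (count_en + count_hi).

    The value is bounded in [-1, +1]: +1 means the category is realised only in
    English, -1 only in Hindi, and values near 0 indicate the category is mixed.
    Each input item is (token, lid, pos) with lid in {"en", "hi"}.
    """
    en, hi = Counter(), Counter()
    for _tok, lid, pos in tokens:
        if lid == "en":
            en[pos] += 1
        elif lid == "hi":
            hi[pos] += 1  # tokens with other LID tags (e.g. universal) are ignored
    scores = {}
    for pos in set(en) | set(hi):
        total = en[pos] + hi[pos]
        scores[pos] = (en[pos] - hi[pos]) / total if total else 0.0
    return scores


if __name__ == "__main__":
    # Toy en-hi code-mixed sentence with made-up LID and PoS tags for illustration.
    sent = [
        ("movie", "en", "NOUN"), ("bahut", "hi", "ADV"), ("achhi", "hi", "ADJ"),
        ("thi", "hi", "VERB"), ("but", "en", "CCONJ"), ("ending", "en", "NOUN"),
        ("boring", "en", "ADJ"), ("tha", "hi", "VERB"),
    ]
    print(symcom_per_category(sent))
    # NOUN -> +1.0 (English only), VERB -> -1.0 (Hindi only), ADJ -> 0.0 (mixed)
```

In this reading, aggregating the per-category scores over a corpus would indicate which syntactic categories carry the switching, which is the kind of cross-dataset comparison the abstract says the measure is used for.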