@inproceedings{singh-etal-2018-twitter,
title = "A {T}witter Corpus for {H}indi-{E}nglish Code Mixed {POS} Tagging",
author = "Singh, Kushagra and
Sen, Indira and
Kumaraguru, Ponnurangam",
editor = "Ku, Lun-Wei and
Li, Cheng-Te",
booktitle = "Proceedings of the Sixth International Workshop on Natural Language Processing for Social Media",
month = jul,
year = "2018",
address = "Melbourne, Australia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W18-3503",
doi = "10.18653/v1/W18-3503",
pages = "12--17",
abstract = "Code-mixing is a linguistic phenomenon where multiple languages are used in the same occurrence that is increasingly common in multilingual societies. Code-mixed content on social media is also on the rise, prompting the need for tools to automatically understand such content. Automatic Parts-of-Speech (POS) tagging is an essential step in any Natural Language Processing (NLP) pipeline, but there is a lack of annotated data to train such models. In this work, we present a unique language tagged and POS-tagged dataset of code-mixed English-Hindi tweets related to five incidents in India that led to a lot of Twitter activity. Our dataset is unique in two dimensions: (i) it is larger than previous annotated datasets and (ii) it closely resembles typical real-world tweets. Additionally, we present a POS tagging model that is trained on this dataset to provide an example of how this dataset can be used. The model also shows the efficacy of our dataset in enabling the creation of code-mixed social media POS taggers.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="singh-etal-2018-twitter">
<titleInfo>
<title>A Twitter Corpus for Hindi-English Code Mixed POS Tagging</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kushagra</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Indira</namePart>
<namePart type="family">Sen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ponnurangam</namePart>
<namePart type="family">Kumaraguru</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Sixth International Workshop on Natural Language Processing for Social Media</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng-Te</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Melbourne, Australia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Code-mixing is a linguistic phenomenon where multiple languages are used in the same occurrence that is increasingly common in multilingual societies. Code-mixed content on social media is also on the rise, prompting the need for tools to automatically understand such content. Automatic Parts-of-Speech (POS) tagging is an essential step in any Natural Language Processing (NLP) pipeline, but there is a lack of annotated data to train such models. In this work, we present a unique language tagged and POS-tagged dataset of code-mixed English-Hindi tweets related to five incidents in India that led to a lot of Twitter activity. Our dataset is unique in two dimensions: (i) it is larger than previous annotated datasets and (ii) it closely resembles typical real-world tweets. Additionally, we present a POS tagging model that is trained on this dataset to provide an example of how this dataset can be used. The model also shows the efficacy of our dataset in enabling the creation of code-mixed social media POS taggers.</abstract>
<identifier type="citekey">singh-etal-2018-twitter</identifier>
<identifier type="doi">10.18653/v1/W18-3503</identifier>
<location>
<url>https://aclanthology.org/W18-3503</url>
</location>
<part>
<date>2018-07</date>
<extent unit="page">
<start>12</start>
<end>17</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Twitter Corpus for Hindi-English Code Mixed POS Tagging
%A Singh, Kushagra
%A Sen, Indira
%A Kumaraguru, Ponnurangam
%Y Ku, Lun-Wei
%Y Li, Cheng-Te
%S Proceedings of the Sixth International Workshop on Natural Language Processing for Social Media
%D 2018
%8 July
%I Association for Computational Linguistics
%C Melbourne, Australia
%F singh-etal-2018-twitter
%X Code-mixing is a linguistic phenomenon where multiple languages are used in the same occurrence that is increasingly common in multilingual societies. Code-mixed content on social media is also on the rise, prompting the need for tools to automatically understand such content. Automatic Parts-of-Speech (POS) tagging is an essential step in any Natural Language Processing (NLP) pipeline, but there is a lack of annotated data to train such models. In this work, we present a unique language tagged and POS-tagged dataset of code-mixed English-Hindi tweets related to five incidents in India that led to a lot of Twitter activity. Our dataset is unique in two dimensions: (i) it is larger than previous annotated datasets and (ii) it closely resembles typical real-world tweets. Additionally, we present a POS tagging model that is trained on this dataset to provide an example of how this dataset can be used. The model also shows the efficacy of our dataset in enabling the creation of code-mixed social media POS taggers.
%R 10.18653/v1/W18-3503
%U https://aclanthology.org/W18-3503
%U https://doi.org/10.18653/v1/W18-3503
%P 12-17
Markdown (Informal)
[A Twitter Corpus for Hindi-English Code Mixed POS Tagging](https://aclanthology.org/W18-3503) (Singh et al., SocialNLP 2018)
ACL
- Kushagra Singh, Indira Sen, and Ponnurangam Kumaraguru. 2018. A Twitter Corpus for Hindi-English Code Mixed POS Tagging. In Proceedings of the Sixth International Workshop on Natural Language Processing for Social Media, pages 12–17, Melbourne, Australia. Association for Computational Linguistics.