@inproceedings{barik-etal-2019-normalization,
title = "Normalization of {I}ndonesian-{E}nglish Code-Mixed {T}witter Data",
author = "Barik, Anab Maulana and
Mahendra, Rahmad and
Adriani, Mirna",
editor = "Xu, Wei and
Ritter, Alan and
Baldwin, Tim and
Rahimi, Afshin",
booktitle = "Proceedings of the 5th Workshop on Noisy User-generated Text (W-NUT 2019)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D19-5554",
doi = "10.18653/v1/D19-5554",
pages = "417--424",
abstract = "Twitter is an excellent source of data for NLP researches as it offers tremendous amount of textual data. However, processing tweet to extract meaningful information is very challenging, at least for two reasons: (i) using nonstandard words as well as informal writing manner, and (ii) code-mixing issues, which is combining multiple languages in single tweet conversation. Most of the previous works have addressed both issues in isolated different task. In this study, we work on normalization task in code-mixed Twitter data, more specifically in Indonesian-English language. We propose a pipeline that consists of four modules, i.e tokenization, language identification, lexical normalization, and translation. Another contribution is to provide a gold standard of Indonesian-English code-mixed data for each module.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="barik-etal-2019-normalization">
<titleInfo>
<title>Normalization of Indonesian-English Code-Mixed Twitter Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anab</namePart>
<namePart type="given">Maulana</namePart>
<namePart type="family">Barik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahmad</namePart>
<namePart type="family">Mahendra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mirna</namePart>
<namePart type="family">Adriani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Noisy User-generated Text (W-NUT 2019)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tim</namePart>
<namePart type="family">Baldwin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Afshin</namePart>
<namePart type="family">Rahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hong Kong, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Twitter is an excellent source of data for NLP researches as it offers tremendous amount of textual data. However, processing tweet to extract meaningful information is very challenging, at least for two reasons: (i) using nonstandard words as well as informal writing manner, and (ii) code-mixing issues, which is combining multiple languages in single tweet conversation. Most of the previous works have addressed both issues in isolated different task. In this study, we work on normalization task in code-mixed Twitter data, more specifically in Indonesian-English language. We propose a pipeline that consists of four modules, i.e tokenization, language identification, lexical normalization, and translation. Another contribution is to provide a gold standard of Indonesian-English code-mixed data for each module.</abstract>
<identifier type="citekey">barik-etal-2019-normalization</identifier>
<identifier type="doi">10.18653/v1/D19-5554</identifier>
<location>
<url>https://aclanthology.org/D19-5554</url>
</location>
<part>
<date>2019-11</date>
<extent unit="page">
<start>417</start>
<end>424</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Normalization of Indonesian-English Code-Mixed Twitter Data
%A Barik, Anab Maulana
%A Mahendra, Rahmad
%A Adriani, Mirna
%Y Xu, Wei
%Y Ritter, Alan
%Y Baldwin, Tim
%Y Rahimi, Afshin
%S Proceedings of the 5th Workshop on Noisy User-generated Text (W-NUT 2019)
%D 2019
%8 November
%I Association for Computational Linguistics
%C Hong Kong, China
%F barik-etal-2019-normalization
%X Twitter is an excellent source of data for NLP researches as it offers tremendous amount of textual data. However, processing tweet to extract meaningful information is very challenging, at least for two reasons: (i) using nonstandard words as well as informal writing manner, and (ii) code-mixing issues, which is combining multiple languages in single tweet conversation. Most of the previous works have addressed both issues in isolated different task. In this study, we work on normalization task in code-mixed Twitter data, more specifically in Indonesian-English language. We propose a pipeline that consists of four modules, i.e tokenization, language identification, lexical normalization, and translation. Another contribution is to provide a gold standard of Indonesian-English code-mixed data for each module.
%R 10.18653/v1/D19-5554
%U https://aclanthology.org/D19-5554
%U https://doi.org/10.18653/v1/D19-5554
%P 417-424
Markdown (Informal)
[Normalization of Indonesian-English Code-Mixed Twitter Data](https://aclanthology.org/D19-5554) (Barik et al., WNUT 2019)
ACL