% Workshop paper record; the url field points at the ACL Anthology page W18-3817.
@inproceedings{dhar-etal-2018-enabling,
    title     = {Enabling Code-Mixed Translation: Parallel Corpus Creation and {MT} Augmentation Approach},
    author    = {Dhar, Mrinal and Kumar, Vaibhav and Shrivastava, Manish},
    editor    = {Machonis, Peter and Barreiro, Anabela and Kocijan, Kristina and Silberztein, Max},
    booktitle = {Proceedings of the First Workshop on Linguistic Resources for Natural Language Processing},
    month     = aug,
    year      = {2018},
    address   = {Santa Fe, New Mexico, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/W18-3817},
    pages     = {131--140},
    abstract  = {Code-mixing, use of two or more languages in a single sentence, is ubiquitous; generated by multi-lingual speakers across the world. The phenomenon presents itself prominently in social media discourse. Consequently, there is a growing need for translating code-mixed hybrid language into standard languages. However, due to the lack of gold parallel data, existing machine translation systems fail to properly translate code-mixed text. In an effort to initiate the task of machine translation of code-mixed content, we present a newly created parallel corpus of code-mixed English-Hindi and English. We selected previously available English-Hindi code-mixed data as a starting point for the creation of our parallel corpus. We then chose 4 human translators, fluent in both English and Hindi, for translating the 6088 code-mixed English-Hindi sentences to English. With the help of the created parallel corpus, we analyzed the structure of English-Hindi code-mixed data and present a technique to augment run-of-the-mill machine translation (MT) approaches that can help achieve superior translations without the need for specially designed translation systems. We present an augmentation pipeline for existing MT approaches, like Phrase Based MT (Moses) and Neural MT, to improve the translation of code-mixed text. The augmentation pipeline is presented as a pre-processing step and can be plugged with any existing MT system, which we demonstrate by improving translations done by systems like Moses, Google Neural Machine Translation System (NMTS) and Bing Translator for English-Hindi code-mixed content.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 serialization of the record with citekey dhar-etal-2018-enabling. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="dhar-etal-2018-enabling">
    <titleInfo>
      <title>Enabling Code-Mixed Translation: Parallel Corpus Creation and MT Augmentation Approach</title>
    </titleInfo>
    <!-- Authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Mrinal</namePart>
      <namePart type="family">Dhar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vaibhav</namePart>
      <namePart type="family">Kumar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Manish</namePart>
      <namePart type="family">Shrivastava</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2018-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher details. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Linguistic Resources for Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Peter</namePart>
        <namePart type="family">Machonis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anabela</namePart>
        <namePart type="family">Barreiro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kristina</namePart>
        <namePart type="family">Kocijan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Max</namePart>
        <namePart type="family">Silberztein</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Santa Fe, New Mexico, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Code-mixing, use of two or more languages in a single sentence, is ubiquitous; generated by multi-lingual speakers across the world. The phenomenon presents itself prominently in social media discourse. Consequently, there is a growing need for translating code-mixed hybrid language into standard languages. However, due to the lack of gold parallel data, existing machine translation systems fail to properly translate code-mixed text. In an effort to initiate the task of machine translation of code-mixed content, we present a newly created parallel corpus of code-mixed English-Hindi and English. We selected previously available English-Hindi code-mixed data as a starting point for the creation of our parallel corpus. We then chose 4 human translators, fluent in both English and Hindi, for translating the 6088 code-mixed English-Hindi sentences to English. With the help of the created parallel corpus, we analyzed the structure of English-Hindi code-mixed data and present a technique to augment run-of-the-mill machine translation (MT) approaches that can help achieve superior translations without the need for specially designed translation systems. We present an augmentation pipeline for existing MT approaches, like Phrase Based MT (Moses) and Neural MT, to improve the translation of code-mixed text. The augmentation pipeline is presented as a pre-processing step and can be plugged with any existing MT system, which we demonstrate by improving translations done by systems like Moses, Google Neural Machine Translation System (NMTS) and Bing Translator for English-Hindi code-mixed content.</abstract>
    <identifier type="citekey">dhar-etal-2018-enabling</identifier>
    <location>
      <url>https://aclanthology.org/W18-3817</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2018-08</date>
      <extent unit="page">
        <start>131</start>
        <end>140</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Enabling Code-Mixed Translation: Parallel Corpus Creation and MT Augmentation Approach
%A Dhar, Mrinal
%A Kumar, Vaibhav
%A Shrivastava, Manish
%Y Machonis, Peter
%Y Barreiro, Anabela
%Y Kocijan, Kristina
%Y Silberztein, Max
%S Proceedings of the First Workshop on Linguistic Resources for Natural Language Processing
%D 2018
%8 August
%I Association for Computational Linguistics
%C Santa Fe, New Mexico, USA
%F dhar-etal-2018-enabling
%X Code-mixing, use of two or more languages in a single sentence, is ubiquitous; generated by multi-lingual speakers across the world. The phenomenon presents itself prominently in social media discourse. Consequently, there is a growing need for translating code-mixed hybrid language into standard languages. However, due to the lack of gold parallel data, existing machine translation systems fail to properly translate code-mixed text. In an effort to initiate the task of machine translation of code-mixed content, we present a newly created parallel corpus of code-mixed English-Hindi and English. We selected previously available English-Hindi code-mixed data as a starting point for the creation of our parallel corpus. We then chose 4 human translators, fluent in both English and Hindi, for translating the 6088 code-mixed English-Hindi sentences to English. With the help of the created parallel corpus, we analyzed the structure of English-Hindi code-mixed data and present a technique to augment run-of-the-mill machine translation (MT) approaches that can help achieve superior translations without the need for specially designed translation systems. We present an augmentation pipeline for existing MT approaches, like Phrase Based MT (Moses) and Neural MT, to improve the translation of code-mixed text. The augmentation pipeline is presented as a pre-processing step and can be plugged with any existing MT system, which we demonstrate by improving translations done by systems like Moses, Google Neural Machine Translation System (NMTS) and Bing Translator for English-Hindi code-mixed content.
%U https://aclanthology.org/W18-3817
%P 131-140
Markdown (Informal)
[Enabling Code-Mixed Translation: Parallel Corpus Creation and MT Augmentation Approach](https://aclanthology.org/W18-3817) (Dhar et al., LR4NLP 2018)
ACL