% Shared-task paper by team CNLP-NITS-PP in the WMT 2022 proceedings.
% Case-sensitive words in the title are brace-protected as whole words so that
% sentence-casing bibliography styles keep them intact.
@inproceedings{laskar-etal-2022-cnlp-nits,
  title     = {{CNLP}-{NITS}-{PP} at {MixMT} 2022: {Hinglish}-{English} Code-Mixed Machine Translation},
  author    = {Laskar, Sahinur Rahman and
               Singh, Rahul and
               Pandey, Shyambabu and
               Manna, Riyanka and
               Pakray, Partha and
               Bandyopadhyay, Sivaji},
  booktitle = {Proceedings of the Seventh Conference on Machine Translation ({WMT})},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi, United Arab Emirates (Hybrid)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.wmt-1.116},
  pages     = {1158--1161},
  abstract  = {The mixing of two or more languages in speech or text is known as code-mixing. In this form of communication, users mix words and phrases from multiple languages. Code-mixing is very common in the context of Indian languages due to the presence of multilingual societies. The probability of the existence of code-mixed sentences in almost all Indian languages since in India English is the dominant language for social media textual communication platforms. We have participated in the WMT22 shared task of code-mixed machine translation with the team name: CNLP-NITS-PP. In this task, we have prepared a synthetic Hinglish{--}English parallel corpus using transliteration of original Hindi sentences to tackle the limitation of the parallel corpus, where, we mainly considered sentences that have named-entity (proper noun) from the available English-Hindi parallel corpus. With the addition of synthetic bi-text data to the original parallel corpus (train set), our transformer-based neural machine translation models have attained recall-oriented understudy for gisting evaluation (ROUGE-L) scores of 0.23815, 0.33729, and word error rate (WER) scores of 0.95458, 0.88451 at Sub-Task-1 (English-to-Hinglish) and Sub-Task-2 (Hinglish-to-English) for test set results respectively.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record for the paper identified by the citekey below. -->
  <mods ID="laskar-etal-2022-cnlp-nits">
    <titleInfo>
      <title>CNLP-NITS-PP at MixMT 2022: Hinglish-English Code-Mixed Machine Translation</title>
    </titleInfo>

    <!-- Authors, in the order they are listed in this record. -->
    <name type="personal">
      <namePart type="given">Sahinur</namePart>
      <namePart type="given">Rahman</namePart>
      <namePart type="family">Laskar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rahul</namePart>
      <namePart type="family">Singh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shyambabu</namePart>
      <namePart type="family">Pandey</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Riyanka</namePart>
      <namePart type="family">Manna</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Partha</namePart>
      <namePart type="family">Pakray</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sivaji</namePart>
      <namePart type="family">Bandyopadhyay</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume containing this paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Seventh Conference on Machine Translation (WMT)</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>

    <abstract>The mixing of two or more languages in speech or text is known as code-mixing. In this form of communication, users mix words and phrases from multiple languages. Code-mixing is very common in the context of Indian languages due to the presence of multilingual societies. The probability of the existence of code-mixed sentences in almost all Indian languages since in India English is the dominant language for social media textual communication platforms. We have participated in the WMT22 shared task of code-mixed machine translation with the team name: CNLP-NITS-PP. In this task, we have prepared a synthetic Hinglish–English parallel corpus using transliteration of original Hindi sentences to tackle the limitation of the parallel corpus, where, we mainly considered sentences that have named-entity (proper noun) from the available English-Hindi parallel corpus. With the addition of synthetic bi-text data to the original parallel corpus (train set), our transformer-based neural machine translation models have attained recall-oriented understudy for gisting evaluation (ROUGE-L) scores of 0.23815, 0.33729, and word error rate (WER) scores of 0.95458, 0.88451 at Sub-Task-1 (English-to-Hinglish) and Sub-Task-2 (Hinglish-to-English) for test set results respectively.</abstract>

    <identifier type="citekey">laskar-etal-2022-cnlp-nits</identifier>
    <location>
      <url>https://aclanthology.org/2022.wmt-1.116</url>
    </location>

    <!-- Issue date and page extent of the paper. -->
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>1158</start>
        <end>1161</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T CNLP-NITS-PP at MixMT 2022: Hinglish-English Code-Mixed Machine Translation
%A Laskar, Sahinur Rahman
%A Singh, Rahul
%A Pandey, Shyambabu
%A Manna, Riyanka
%A Pakray, Partha
%A Bandyopadhyay, Sivaji
%S Proceedings of the Seventh Conference on Machine Translation (WMT)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F laskar-etal-2022-cnlp-nits
%X The mixing of two or more languages in speech or text is known as code-mixing. In this form of communication, users mix words and phrases from multiple languages. Code-mixing is very common in the context of Indian languages due to the presence of multilingual societies. The probability of the existence of code-mixed sentences in almost all Indian languages since in India English is the dominant language for social media textual communication platforms. We have participated in the WMT22 shared task of code-mixed machine translation with the team name: CNLP-NITS-PP. In this task, we have prepared a synthetic Hinglish–English parallel corpus using transliteration of original Hindi sentences to tackle the limitation of the parallel corpus, where, we mainly considered sentences that have named-entity (proper noun) from the available English-Hindi parallel corpus. With the addition of synthetic bi-text data to the original parallel corpus (train set), our transformer-based neural machine translation models have attained recall-oriented understudy for gisting evaluation (ROUGE-L) scores of 0.23815, 0.33729, and word error rate (WER) scores of 0.95458, 0.88451 at Sub-Task-1 (English-to-Hinglish) and Sub-Task-2 (Hinglish-to-English) for test set results respectively.
%U https://aclanthology.org/2022.wmt-1.116
%P 1158-1161
Markdown (Informal)
[CNLP-NITS-PP at MixMT 2022: Hinglish-English Code-Mixed Machine Translation](https://aclanthology.org/2022.wmt-1.116) (Laskar et al., WMT 2022)
ACL