@inproceedings{arindam-etal-2023-lost,
title = "Lost in Translation No More: Fine-tuned transformer-based models for {C}ode{M}ix to {E}nglish Machine Translation",
author = "Chatterjee, Arindam and
Sharma, Chhavi and
V.p., Yashwanth and
Kumar, Niraj and
Raj, Ayush and
Ekbal, Asif",
editor = "D. Pawar, Jyoti and
Lalitha Devi, Sobha",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.25",
pages = "326--335",
abstract = "Codemixing, the linguistic phenomenon where a speaker alternates between two or more languages within a conversation or even a single utterance, presents a significant challenge for machine translation systems due to its syntactic complexity and contextual nuances. This paper introduces a set of advanced transformerbased models fine-tuned specifically for translating codemixed text to English, more specifically, Hindi-English (colloquially referred to as Hinglish) codemixed text into English. Unlike standard bilingual corpora, codemixed data requires an understanding of the intricacies of grammatical structures and cultural contexts embedded within the language blend. Existing machine translation efforts in codemixed languages have largely been constrained by the paucity of robust datasets and models that can capture the nuanced semantic and syntactic interplay characteristic of such languages. We present a novel dataset PACMAN trans for Hinglish to English machine translation, based on the PACMAN strategy, meticulously curated to represent natural codemixing patterns. Our generic fine-tuned translation models trained on the novel data outperforms current state-of-theart Large Language Models (LLMs) by 38{\%} in terms of BLEU score. Further, when fine-tuned on custom benchmark datasets, our focused dual fine-tuned models surpass the PHINC dataset BLEU score benchmark by 22{\%}. Our comparative analysis illustrates significant improvements in translation quality, showcasing the potential of fine-tuning transformer models in bridging the linguistic divide in codemixed language translation. The success of our models reflects a promising step forward in the quest to provide seamless translation services for the ever-growing multilingual population and the complex linguistic phenomena they generate.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arindam-etal-2023-lost">
<titleInfo>
<title>Lost in Translation No More: Fine-tuned transformer-based models for CodeMix to English Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arindam</namePart>
<namePart type="family">Chatterjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chhavi</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yashwanth</namePart>
<namePart type="family">V.p.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niraj</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayush</namePart>
<namePart type="family">Raj</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jyoti</namePart>
<namePart type="family">D. Pawar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Codemixing, the linguistic phenomenon where a speaker alternates between two or more languages within a conversation or even a single utterance, presents a significant challenge for machine translation systems due to its syntactic complexity and contextual nuances. This paper introduces a set of advanced transformerbased models fine-tuned specifically for translating codemixed text to English, more specifically, Hindi-English (colloquially referred to as Hinglish) codemixed text into English. Unlike standard bilingual corpora, codemixed data requires an understanding of the intricacies of grammatical structures and cultural contexts embedded within the language blend. Existing machine translation efforts in codemixed languages have largely been constrained by the paucity of robust datasets and models that can capture the nuanced semantic and syntactic interplay characteristic of such languages. We present a novel dataset PACMAN trans for Hinglish to English machine translation, based on the PACMAN strategy, meticulously curated to represent natural codemixing patterns. Our generic fine-tuned translation models trained on the novel data outperforms current state-of-theart Large Language Models (LLMs) by 38% in terms of BLEU score. Further, when fine-tuned on custom benchmark datasets, our focused dual fine-tuned models surpass the PHINC dataset BLEU score benchmark by 22%. Our comparative analysis illustrates significant improvements in translation quality, showcasing the potential of fine-tuning transformer models in bridging the linguistic divide in codemixed language translation. The success of our models reflects a promising step forward in the quest to provide seamless translation services for the ever-growing multilingual population and the complex linguistic phenomena they generate.</abstract>
<identifier type="citekey">arindam-etal-2023-lost</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.25</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>326</start>
<end>335</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lost in Translation No More: Fine-tuned transformer-based models for CodeMix to English Machine Translation
%A Chatterjee, Arindam
%A Sharma, Chhavi
%A V.p., Yashwanth
%A Kumar, Niraj
%A Raj, Ayush
%A Ekbal, Asif
%Y D. Pawar, Jyoti
%Y Lalitha Devi, Sobha
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F arindam-etal-2023-lost
%X Codemixing, the linguistic phenomenon where a speaker alternates between two or more languages within a conversation or even a single utterance, presents a significant challenge for machine translation systems due to its syntactic complexity and contextual nuances. This paper introduces a set of advanced transformerbased models fine-tuned specifically for translating codemixed text to English, more specifically, Hindi-English (colloquially referred to as Hinglish) codemixed text into English. Unlike standard bilingual corpora, codemixed data requires an understanding of the intricacies of grammatical structures and cultural contexts embedded within the language blend. Existing machine translation efforts in codemixed languages have largely been constrained by the paucity of robust datasets and models that can capture the nuanced semantic and syntactic interplay characteristic of such languages. We present a novel dataset PACMAN trans for Hinglish to English machine translation, based on the PACMAN strategy, meticulously curated to represent natural codemixing patterns. Our generic fine-tuned translation models trained on the novel data outperforms current state-of-theart Large Language Models (LLMs) by 38% in terms of BLEU score. Further, when fine-tuned on custom benchmark datasets, our focused dual fine-tuned models surpass the PHINC dataset BLEU score benchmark by 22%. Our comparative analysis illustrates significant improvements in translation quality, showcasing the potential of fine-tuning transformer models in bridging the linguistic divide in codemixed language translation. The success of our models reflects a promising step forward in the quest to provide seamless translation services for the ever-growing multilingual population and the complex linguistic phenomena they generate.
%U https://aclanthology.org/2023.icon-1.25
%P 326-335
Markdown (Informal)
[Lost in Translation No More: Fine-tuned transformer-based models for CodeMix to English Machine Translation](https://aclanthology.org/2023.icon-1.25) (Chatterjee et al., ICON 2023)
ACL