@inproceedings{ahmed-etal-2023-guit,
title = "{GUIT}-{NLP}{'}s Submission to Shared Task: Low Resource {I}ndic Language Translation",
author = "Ahmed, Mazida and
Talukdar, Kuwali and
Boruah, Parvez and
Sarma, Prof. Shikhar Kumar and
Kashyap, Kishore",
editor = "Koehn, Philipp and
Haddow, Barry and
Kocmi, Tom and
Monz, Christof",
booktitle = "Proceedings of the Eighth Conference on Machine Translation",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.wmt-1.87",
doi = "10.18653/v1/2023.wmt-1.87",
pages = "935--940",
abstract = "This paper describes the submission of the GUIT-NLP team in the {``}Shared Task: Low Resource Indic Language Translation{''} focusing on three low-resource language pairs: English-Mizo, English-Khasi, and English-Assamese. The initial phase involves an in-depth exploration of Neural Machine Translation (NMT) techniques tailored to the available data. Within this investigation, various Subword Tokenization approaches, model configurations (exploring differnt hyper-parameters etc.) of the general NMT pipeline are tested to identify the most effective method. Subsequently, we address the challenge of low-resource languages by leveraging monolingual data through an innovative and systematic application of the Back Translation technique for English-Mizo. During model training, the monolingual data is progressively integrated into the original bilingual dataset, with each iteration yielding higher-quality back translations. This iterative approach significantly enhances the model{'}s performance, resulting in a notable increase of +3.65 in BLEU scores. Further improvements of +5.59 are achieved through fine-tuning using authentic parallel data.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahmed-etal-2023-guit">
<titleInfo>
<title>GUIT-NLP’s Submission to Shared Task: Low Resource Indic Language Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mazida</namePart>
<namePart type="family">Ahmed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kuwali</namePart>
<namePart type="family">Talukdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parvez</namePart>
<namePart type="family">Boruah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prof.</namePart>
<namePart type="given">Shikhar</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Sarma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kishore</namePart>
<namePart type="family">Kashyap</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes the submission of the GUIT-NLP team in the “Shared Task: Low Resource Indic Language Translation” focusing on three low-resource language pairs: English-Mizo, English-Khasi, and English-Assamese. The initial phase involves an in-depth exploration of Neural Machine Translation (NMT) techniques tailored to the available data. Within this investigation, various Subword Tokenization approaches, model configurations (exploring differnt hyper-parameters etc.) of the general NMT pipeline are tested to identify the most effective method. Subsequently, we address the challenge of low-resource languages by leveraging monolingual data through an innovative and systematic application of the Back Translation technique for English-Mizo. During model training, the monolingual data is progressively integrated into the original bilingual dataset, with each iteration yielding higher-quality back translations. This iterative approach significantly enhances the model’s performance, resulting in a notable increase of +3.65 in BLEU scores. Further improvements of +5.59 are achieved through fine-tuning using authentic parallel data.</abstract>
<identifier type="citekey">ahmed-etal-2023-guit</identifier>
<identifier type="doi">10.18653/v1/2023.wmt-1.87</identifier>
<location>
<url>https://aclanthology.org/2023.wmt-1.87</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>935</start>
<end>940</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GUIT-NLP’s Submission to Shared Task: Low Resource Indic Language Translation
%A Ahmed, Mazida
%A Talukdar, Kuwali
%A Boruah, Parvez
%A Sarma, Prof. Shikhar Kumar
%A Kashyap, Kishore
%Y Koehn, Philipp
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Monz, Christof
%S Proceedings of the Eighth Conference on Machine Translation
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F ahmed-etal-2023-guit
%X This paper describes the submission of the GUIT-NLP team in the “Shared Task: Low Resource Indic Language Translation” focusing on three low-resource language pairs: English-Mizo, English-Khasi, and English-Assamese. The initial phase involves an in-depth exploration of Neural Machine Translation (NMT) techniques tailored to the available data. Within this investigation, various Subword Tokenization approaches, model configurations (exploring differnt hyper-parameters etc.) of the general NMT pipeline are tested to identify the most effective method. Subsequently, we address the challenge of low-resource languages by leveraging monolingual data through an innovative and systematic application of the Back Translation technique for English-Mizo. During model training, the monolingual data is progressively integrated into the original bilingual dataset, with each iteration yielding higher-quality back translations. This iterative approach significantly enhances the model’s performance, resulting in a notable increase of +3.65 in BLEU scores. Further improvements of +5.59 are achieved through fine-tuning using authentic parallel data.
%R 10.18653/v1/2023.wmt-1.87
%U https://aclanthology.org/2023.wmt-1.87
%U https://doi.org/10.18653/v1/2023.wmt-1.87
%P 935-940
Markdown (Informal)
[GUIT-NLP’s Submission to Shared Task: Low Resource Indic Language Translation](https://aclanthology.org/2023.wmt-1.87) (Ahmed et al., WMT 2023)
ACL