@inproceedings{parvez-etal-2023-neural,
title = "Neural Machine Translation for a Low Resource Language Pair: {E}nglish-{B}odo",
author = "Parvez, Boruah and
Kuwali, Talukdar and
Mazida, Ahmed and
Kishore, Kashyap",
editor = "Jyoti, D. Pawar and
Sobha, Lalitha Devi",
booktitle = "Proceedings of the 20th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2023",
address = "Goa University, Goa, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2023.icon-1.21",
pages = "295--300",
abstract = "This paper presents work done on Neural Machine Translation for the English and Bodo language pair. English is a language spoken around the world, whereas Bodo is a language mostly spoken in the North Eastern area of India. This work of machine translation is done on a relatively small amount of parallel data, as there is little parallel corpus available for the English-Bodo pair. The corpus is generally taken from available sources, namely the National Platform of Language Technology (NPLT), the Data Management Unit (DMU), Mission Bhashini, Ministry of Electronics and Information Technology, and is also generated in-house. Tokenization of raw text is done using the IndicNLP library and mosesdecoder for Bodo and English respectively. Subword tokenization is performed using BPE (Byte Pair Encoding), SentencePiece and WordPiece. Experiments have been done with two different vocabulary sizes of 8000 and 16000 on a total of around 92410 parallel sentences. Two standard transformer encoder-decoder models with varying numbers of layers and hidden sizes are built for training on the data using the OpenNMT-py framework. The results are evaluated based on the BLEU score on an additional test set for evaluating the performance. The highest BLEU scores of 11.01 and 14.62 are achieved on the test set for English to Bodo and Bodo to English translation respectively.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="parvez-etal-2023-neural">
<titleInfo>
<title>Neural Machine Translation for a Low Resource Language Pair: English-Bodo</title>
</titleInfo>
<name type="personal">
<namePart type="given">Boruah</namePart>
<namePart type="family">Parvez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Talukdar</namePart>
<namePart type="family">Kuwali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Mazida</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kashyap</namePart>
<namePart type="family">Kishore</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">D</namePart>
<namePart type="given">Pawar</namePart>
<namePart type="family">Jyoti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lalitha</namePart>
<namePart type="given">Devi</namePart>
<namePart type="family">Sobha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Goa University, Goa, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents work done on Neural Machine Translation for the English and Bodo language pair. English is a language spoken around the world, whereas Bodo is a language mostly spoken in the North Eastern area of India. This work of machine translation is done on a relatively small amount of parallel data, as there is little parallel corpus available for the English-Bodo pair. The corpus is generally taken from available sources, namely the National Platform of Language Technology (NPLT), the Data Management Unit (DMU), Mission Bhashini, Ministry of Electronics and Information Technology, and is also generated in-house. Tokenization of raw text is done using the IndicNLP library and mosesdecoder for Bodo and English respectively. Subword tokenization is performed using BPE (Byte Pair Encoding), SentencePiece and WordPiece. Experiments have been done with two different vocabulary sizes of 8000 and 16000 on a total of around 92410 parallel sentences. Two standard transformer encoder-decoder models with varying numbers of layers and hidden sizes are built for training on the data using the OpenNMT-py framework. The results are evaluated based on the BLEU score on an additional test set for evaluating the performance. The highest BLEU scores of 11.01 and 14.62 are achieved on the test set for English to Bodo and Bodo to English translation respectively.</abstract>
<identifier type="citekey">parvez-etal-2023-neural</identifier>
<location>
<url>https://aclanthology.org/2023.icon-1.21</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>295</start>
<end>300</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Neural Machine Translation for a Low Resource Language Pair: English-Bodo
%A Parvez, Boruah
%A Kuwali, Talukdar
%A Mazida, Ahmed
%A Kishore, Kashyap
%Y Jyoti, D. Pawar
%Y Sobha, Lalitha Devi
%S Proceedings of the 20th International Conference on Natural Language Processing (ICON)
%D 2023
%8 December
%I NLP Association of India (NLPAI)
%C Goa University, Goa, India
%F parvez-etal-2023-neural
%X This paper presents work done on Neural Machine Translation for the English and Bodo language pair. English is a language spoken around the world, whereas Bodo is a language mostly spoken in the North Eastern area of India. This work of machine translation is done on a relatively small amount of parallel data, as there is little parallel corpus available for the English-Bodo pair. The corpus is generally taken from available sources, namely the National Platform of Language Technology (NPLT), the Data Management Unit (DMU), Mission Bhashini, Ministry of Electronics and Information Technology, and is also generated in-house. Tokenization of raw text is done using the IndicNLP library and mosesdecoder for Bodo and English respectively. Subword tokenization is performed using BPE (Byte Pair Encoding), SentencePiece and WordPiece. Experiments have been done with two different vocabulary sizes of 8000 and 16000 on a total of around 92410 parallel sentences. Two standard transformer encoder-decoder models with varying numbers of layers and hidden sizes are built for training on the data using the OpenNMT-py framework. The results are evaluated based on the BLEU score on an additional test set for evaluating the performance. The highest BLEU scores of 11.01 and 14.62 are achieved on the test set for English to Bodo and Bodo to English translation respectively.
%U https://aclanthology.org/2023.icon-1.21
%P 295-300
Markdown (Informal)
[Neural Machine Translation for a Low Resource Language Pair: English-Bodo](https://aclanthology.org/2023.icon-1.21) (Parvez et al., ICON 2023)
ACL