% ICON 2021 conference paper (URL: ACL Anthology record 2021.icon-main.6).
% Braces in title and booktitle protect whole words (EduMT, Indian, ICON) so
% that bibliography styles which recase titles leave them untouched.
@inproceedings{appicharla-etal-2021-edumt,
  title     = {{EduMT}: Developing Machine Translation System for Educational Content in {Indian} Languages},
  author    = {Appicharla, Ramakrishna and
               Ekbal, Asif and
               Bhattacharyya, Pushpak},
  editor    = {Bandyopadhyay, Sivaji and
               Devi, Sobha Lalitha and
               Bhattacharyya, Pushpak},
  booktitle = {Proceedings of the 18th International Conference on Natural Language Processing ({ICON})},
  month     = dec,
  year      = {2021},
  address   = {National Institute of Technology Silchar, Silchar, India},
  publisher = {NLP Association of India (NLPAI)},
  url       = {https://aclanthology.org/2021.icon-main.6},
  pages     = {35--43},
  abstract  = {In this paper, we explore various approaches to build Hindi to Bengali Neural Machine Translation (NMT) systems for the educational domain. Translation of educational content poses several challenges, such as unavailability of gold standard data for model building, extensive uses of domain-specific terms, as well as the presence of noise in the form of spontaneous speech as the corpus is prepared from subtitle data and noise due to the process of corpus creation through back-translation. We create an educational parallel corpus by crawling lecture subtitles and translating them into Hindi and Bengali using Google translate. We also create a clean parallel corpus by post-editing synthetic corpus via annotation and crowd-sourcing. We build NMT systems on the prepared corpus with domain adaptation objectives. We also explore data augmentation methods by automatically cleaning synthetic corpus and using it to further train the models. We experiment with combining domain adaptation objective with multilingual NMT. We report BLEU and TER scores of all the models on a manually created Hindi-Bengali educational testset. Our experiments show that the multilingual domain adaptation model outperforms all the other models by achieving 34.8 BLEU and 0.466 TER scores.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record (citekey appicharla-etal-2021-edumt). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="appicharla-etal-2021-edumt">
    <titleInfo>
      <title>EduMT: Developing Machine Translation System for Educational Content in Indian Languages</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Ramakrishna</namePart>
      <namePart type="family">Appicharla</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Asif</namePart>
      <namePart type="family">Ekbal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pushpak</namePart>
      <namePart type="family">Bhattacharyya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th International Conference on Natural Language Processing (ICON)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sivaji</namePart>
        <namePart type="family">Bandyopadhyay</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sobha</namePart>
        <namePart type="given">Lalitha</namePart>
        <namePart type="family">Devi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pushpak</namePart>
        <namePart type="family">Bhattacharyya</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>NLP Association of India (NLPAI)</publisher>
        <place>
          <placeTerm type="text">National Institute of Technology Silchar, Silchar, India</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we explore various approaches to build Hindi to Bengali Neural Machine Translation (NMT) systems for the educational domain. Translation of educational content poses several challenges, such as unavailability of gold standard data for model building, extensive uses of domain-specific terms, as well as the presence of noise in the form of spontaneous speech as the corpus is prepared from subtitle data and noise due to the process of corpus creation through back-translation. We create an educational parallel corpus by crawling lecture subtitles and translating them into Hindi and Bengali using Google translate. We also create a clean parallel corpus by post-editing synthetic corpus via annotation and crowd-sourcing. We build NMT systems on the prepared corpus with domain adaptation objectives. We also explore data augmentation methods by automatically cleaning synthetic corpus and using it to further train the models. We experiment with combining domain adaptation objective with multilingual NMT. We report BLEU and TER scores of all the models on a manually created Hindi-Bengali educational testset. Our experiments show that the multilingual domain adaptation model outperforms all the other models by achieving 34.8 BLEU and 0.466 TER scores.</abstract>
    <identifier type="citekey">appicharla-etal-2021-edumt</identifier>
    <location>
      <url>https://aclanthology.org/2021.icon-main.6</url>
    </location>
    <!-- Issue date and page extent of this paper. -->
    <part>
      <date>2021-12</date>
      <extent unit="page">
        <start>35</start>
        <end>43</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T EduMT: Developing Machine Translation System for Educational Content in Indian Languages
%A Appicharla, Ramakrishna
%A Ekbal, Asif
%A Bhattacharyya, Pushpak
%Y Bandyopadhyay, Sivaji
%Y Devi, Sobha Lalitha
%Y Bhattacharyya, Pushpak
%S Proceedings of the 18th International Conference on Natural Language Processing (ICON)
%D 2021
%8 December
%I NLP Association of India (NLPAI)
%C National Institute of Technology Silchar, Silchar, India
%F appicharla-etal-2021-edumt
%X In this paper, we explore various approaches to build Hindi to Bengali Neural Machine Translation (NMT) systems for the educational domain. Translation of educational content poses several challenges, such as unavailability of gold standard data for model building, extensive uses of domain-specific terms, as well as the presence of noise in the form of spontaneous speech as the corpus is prepared from subtitle data and noise due to the process of corpus creation through back-translation. We create an educational parallel corpus by crawling lecture subtitles and translating them into Hindi and Bengali using Google translate. We also create a clean parallel corpus by post-editing synthetic corpus via annotation and crowd-sourcing. We build NMT systems on the prepared corpus with domain adaptation objectives. We also explore data augmentation methods by automatically cleaning synthetic corpus and using it to further train the models. We experiment with combining domain adaptation objective with multilingual NMT. We report BLEU and TER scores of all the models on a manually created Hindi-Bengali educational testset. Our experiments show that the multilingual domain adaptation model outperforms all the other models by achieving 34.8 BLEU and 0.466 TER scores.
%U https://aclanthology.org/2021.icon-main.6
%P 35-43
Markdown (Informal)
[EduMT: Developing Machine Translation System for Educational Content in Indian Languages](https://aclanthology.org/2021.icon-main.6) (Appicharla et al., ICON 2021)
ACL