@inproceedings{huidrom-etal-2021-em,
title = "{EM} Corpus: a comparable corpus for a less-resourced language pair {M}anipuri-{E}nglish",
author = "Huidrom, Rudali and
Lepage, Yves and
Khomdram, Khogendra",
editor = "Rapp, Reinhard and
Sharoff, Serge and
Zweigenbaum, Pierre",
booktitle = "Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)",
month = sep,
year = "2021",
address = "Online (Virtual Mode)",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.bucc-1.8/",
pages = "60--67",
abstract = "In this paper, we introduce a sentence-level comparable text corpus crawled and created for the less-resourced language pair, Manipuri (mni) and English (eng). Our monolingual corpora comprise 1.88 million Manipuri sentences and 1.45 million English sentences, and our parallel corpus comprises 124,975 Manipuri-English sentence pairs. These data were crawled and collected over a year from August 2020 to March 2021 from a local newspaper website called {\textquoteleft}The Sangai Express.{\textquoteright} The resources reported in this paper are made available to help the low-resourced languages community for MT/NLP tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huidrom-etal-2021-em">
<titleInfo>
<title>EM Corpus: a comparable corpus for a less-resourced language pair Manipuri-English</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rudali</namePart>
<namePart type="family">Huidrom</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Lepage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khogendra</namePart>
<namePart type="family">Khomdram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reinhard</namePart>
<namePart type="family">Rapp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Serge</namePart>
<namePart type="family">Sharoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Zweigenbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Online (Virtual Mode)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we introduce a sentence-level comparable text corpus crawled and created for the less-resourced language pair, Manipuri (mni) and English (eng). Our monolingual corpora comprise 1.88 million Manipuri sentences and 1.45 million English sentences, and our parallel corpus comprises 124,975 Manipuri-English sentence pairs. These data were crawled and collected over a year from August 2020 to March 2021 from a local newspaper website called ‘The Sangai Express.’ The resources reported in this paper are made available to help the low-resourced languages community for MT/NLP tasks.</abstract>
<identifier type="citekey">huidrom-etal-2021-em</identifier>
<location>
<url>https://aclanthology.org/2021.bucc-1.8/</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>60</start>
<end>67</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EM Corpus: a comparable corpus for a less-resourced language pair Manipuri-English
%A Huidrom, Rudali
%A Lepage, Yves
%A Khomdram, Khogendra
%Y Rapp, Reinhard
%Y Sharoff, Serge
%Y Zweigenbaum, Pierre
%S Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Online (Virtual Mode)
%F huidrom-etal-2021-em
%X In this paper, we introduce a sentence-level comparable text corpus crawled and created for the less-resourced language pair, Manipuri (mni) and English (eng). Our monolingual corpora comprise 1.88 million Manipuri sentences and 1.45 million English sentences, and our parallel corpus comprises 124,975 Manipuri-English sentence pairs. These data were crawled and collected over a year from August 2020 to March 2021 from a local newspaper website called ‘The Sangai Express.’ The resources reported in this paper are made available to help the low-resourced languages community for MT/NLP tasks.
%U https://aclanthology.org/2021.bucc-1.8/
%P 60-67
Markdown (Informal)
[EM Corpus: a comparable corpus for a less-resourced language pair Manipuri-English](https://aclanthology.org/2021.bucc-1.8/) (Huidrom et al., BUCC 2021)
ACL