% Conference paper presenting the ParIce English-Icelandic parallel corpus
% (ACL Anthology W19-6115).
% Title: whole words are brace-protected so that styles which recase titles
% keep the capitalisation of ParIce, English and Icelandic.
% Names: non-ASCII letters are written as LaTeX escapes; \dh and \th need a
% font encoding that provides them (e.g. T1).
@inproceedings{barkarson-steingrimsson-2019-compiling,
    title     = {Compiling and Filtering {ParIce}: An {English}-{Icelandic} Parallel Corpus},
    author    = {Barkarson, Starka{\dh}ur and
                 Steingr{\'\i}msson, Stein{\th}{\'o}r},
    editor    = {Hartmann, Mareike and
                 Plank, Barbara},
    booktitle = {Proceedings of the 22nd Nordic Conference on Computational Linguistics},
    month     = sep # "{--}" # oct,
    year      = {2019},
    address   = {Turku, Finland},
    publisher = {Link{\"o}ping University Electronic Press},
    url       = {https://aclanthology.org/W19-6115},
    pages     = {140--145},
    abstract  = {We present ParIce, a new English-Icelandic parallel corpus. This is the first parallel corpus built for the purposes of language technology development and research for Icelandic, although some Icelandic texts can be found in various other multilingual parallel corpora. We map out which Icelandic texts are available for these purposes, collect aligned data and align other bilingual texts we acquired. We describe the alignment process and how we filter the data to weed out noise and bad alignments. In total we collected 43 million Icelandic words in 4.3 million aligned segment pairs, but after filtering, our corpus includes 38.8 million Icelandic words in 3.5 million segment pairs. We estimate that approximately 5{\%} of the corpus data is noise or faulty alignments while more than 50{\%} of the segments we deleted were faulty. We estimate that our filtering process reduced the number of faulty segments in the corpus by more than 60{\%} while only reducing the number of good alignments by approximately 8{\%}.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="barkarson-steingrimsson-2019-compiling">
<titleInfo>
<title>Compiling and Filtering ParIce: An English-Icelandic Parallel Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Starkaður</namePart>
<namePart type="family">Barkarson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steinþór</namePart>
<namePart type="family">Steingrímsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-sep–oct</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Nordic Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mareike</namePart>
<namePart type="family">Hartmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Plank</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Linköping University Electronic Press</publisher>
<place>
<placeTerm type="text">Turku, Finland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present ParIce, a new English-Icelandic parallel corpus. This is the first parallel corpus built for the purposes of language technology development and research for Icelandic, although some Icelandic texts can be found in various other multilingual parallel corpora. We map out which Icelandic texts are available for these purposes, collect aligned data and align other bilingual texts we acquired. We describe the alignment process and how we filter the data to weed out noise and bad alignments. In total we collected 43 million Icelandic words in 4.3 million aligned segment pairs, but after filtering, our corpus includes 38.8 million Icelandic words in 3.5 million segment pairs. We estimate that approximately 5% of the corpus data is noise or faulty alignments while more than 50% of the segments we deleted were faulty. We estimate that our filtering process reduced the number of faulty segments in the corpus by more than 60% while only reducing the number of good alignments by approximately 8%.</abstract>
<identifier type="citekey">barkarson-steingrimsson-2019-compiling</identifier>
<location>
<url>https://aclanthology.org/W19-6115</url>
</location>
<part>
<date>2019-sep–oct</date>
<extent unit="page">
<start>140</start>
<end>145</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Compiling and Filtering ParIce: An English-Icelandic Parallel Corpus
%A Barkarson, Starkaður
%A Steingrímsson, Steinþór
%Y Hartmann, Mareike
%Y Plank, Barbara
%S Proceedings of the 22nd Nordic Conference on Computational Linguistics
%D 2019
%8 sep–oct
%I Linköping University Electronic Press
%C Turku, Finland
%F barkarson-steingrimsson-2019-compiling
%X We present ParIce, a new English-Icelandic parallel corpus. This is the first parallel corpus built for the purposes of language technology development and research for Icelandic, although some Icelandic texts can be found in various other multilingual parallel corpora. We map out which Icelandic texts are available for these purposes, collect aligned data and align other bilingual texts we acquired. We describe the alignment process and how we filter the data to weed out noise and bad alignments. In total we collected 43 million Icelandic words in 4.3 million aligned segment pairs, but after filtering, our corpus includes 38.8 million Icelandic words in 3.5 million segment pairs. We estimate that approximately 5% of the corpus data is noise or faulty alignments while more than 50% of the segments we deleted were faulty. We estimate that our filtering process reduced the number of faulty segments in the corpus by more than 60% while only reducing the number of good alignments by approximately 8%.
%U https://aclanthology.org/W19-6115
%P 140-145
Markdown (Informal)
[Compiling and Filtering ParIce: An English-Icelandic Parallel Corpus](https://aclanthology.org/W19-6115) (Barkarson & Steingrímsson, NoDaLiDa 2019)
ACL