@inproceedings{steingrimsson-etal-2021-effective,
title = "Effective Bitext Extraction From Comparable Corpora Using a Combination of Three Different Approaches",
author = "Steingr{\'\i}msson, Stein{\th}{\'o}r and
Lohar, Pintu and
Loftsson, Hrafn and
Way, Andy",
editor = "Rapp, Reinhard and
Sharoff, Serge and
Zweigenbaum, Pierre",
booktitle = "Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)",
month = sep,
year = "2021",
address = "Online (Virtual Mode)",
publisher = "INCOMA Ltd.",
url = "https://aclanthology.org/2021.bucc-1.3",
pages = "8--17",
abstract = "Parallel sentences extracted from comparable corpora can be useful to supplement parallel corpora when training machine translation (MT) systems. This is even more prominent in low-resource scenarios, where parallel corpora are scarce. In this paper, we present a system which uses three very different measures to identify and score parallel sentences from comparable corpora. We measure the accuracy of our methods in low-resource settings by comparing the results against manually curated test data for English{--}Icelandic, and by evaluating an MT system trained on the concatenation of the parallel data extracted by our approach and an existing data set. We show that the system is capable of extracting useful parallel sentences with high accuracy, and that the extracted pairs substantially increase translation quality of an MT system trained on the data, as measured by automatic evaluation metrics.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="steingrimsson-etal-2021-effective">
<titleInfo>
<title>Effective Bitext Extraction From Comparable Corpora Using a Combination of Three Different Approaches</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stein\thór</namePart>
<namePart type="family">Steingrímsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pintu</namePart>
<namePart type="family">Lohar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andy</namePart>
<namePart type="family">Way</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reinhard</namePart>
<namePart type="family">Rapp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Serge</namePart>
<namePart type="family">Sharoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Zweigenbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Online (Virtual Mode)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Parallel sentences extracted from comparable corpora can be useful to supplement parallel corpora when training machine translation (MT) systems. This is even more prominent in low-resource scenarios, where parallel corpora are scarce. In this paper, we present a system which uses three very different measures to identify and score parallel sentences from comparable corpora. We measure the accuracy of our methods in low-resource settings by comparing the results against manually curated test data for English–Icelandic, and by evaluating an MT system trained on the concatenation of the parallel data extracted by our approach and an existing data set. We show that the system is capable of extracting useful parallel sentences with high accuracy, and that the extracted pairs substantially increase translation quality of an MT system trained on the data, as measured by automatic evaluation metrics.</abstract>
<identifier type="citekey">steingrimsson-etal-2021-effective</identifier>
<location>
<url>https://aclanthology.org/2021.bucc-1.3</url>
</location>
<part>
<date>2021-09</date>
<extent unit="page">
<start>8</start>
<end>17</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Effective Bitext Extraction From Comparable Corpora Using a Combination of Three Different Approaches
%A Steingrímsson, Stein\thór
%A Lohar, Pintu
%A Loftsson, Hrafn
%A Way, Andy
%Y Rapp, Reinhard
%Y Sharoff, Serge
%Y Zweigenbaum, Pierre
%S Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)
%D 2021
%8 September
%I INCOMA Ltd.
%C Online (Virtual Mode)
%F steingrimsson-etal-2021-effective
%X Parallel sentences extracted from comparable corpora can be useful to supplement parallel corpora when training machine translation (MT) systems. This is even more prominent in low-resource scenarios, where parallel corpora are scarce. In this paper, we present a system which uses three very different measures to identify and score parallel sentences from comparable corpora. We measure the accuracy of our methods in low-resource settings by comparing the results against manually curated test data for English–Icelandic, and by evaluating an MT system trained on the concatenation of the parallel data extracted by our approach and an existing data set. We show that the system is capable of extracting useful parallel sentences with high accuracy, and that the extracted pairs substantially increase translation quality of an MT system trained on the data, as measured by automatic evaluation metrics.
%U https://aclanthology.org/2021.bucc-1.3
%P 8-17
Markdown (Informal)
[Effective Bitext Extraction From Comparable Corpora Using a Combination of Three Different Approaches](https://aclanthology.org/2021.bucc-1.3) (Steingrímsson et al., BUCC 2021)
ACL