@inproceedings{steingrimsson-etal-2023-discard,
title = "Do Not Discard {--} Extracting Useful Fragments from Low-Quality Parallel Data to Improve Machine Translation",
author = "Steingr{\'i}msson, Stein{\th}{\'o}r and
Lohar, Pintu and
Loftsson, Hrafn and
Way, Andy",
booktitle = "Proceedings of the Second Workshop on Corpus Generation and Corpus Augmentation for Machine Translation",
month = sep,
year = "2023",
address = "Macau SAR, China",
publisher = "Asia-Pacific Association for Machine Translation",
url = "https://aclanthology.org/2023.mtsummit-coco4mt.1/",
pages = "1--13",
abstract = "When parallel corpora are preprocessed for machine translation (MT) training, a part of the parallel data is commonly discarded and deemed non-parallel due to odd-length ratio, overlapping text in source and target sentences or failing some other form of a semantic equivalency test. For language pairs with limited parallel resources, this can be costly as in such cases modest amounts of acceptable data may be useful to help build MT systems that generate higher quality translations. In this paper, we refine parallel corpora for two language pairs, English{--}Bengali and English{--}Icelandic, by extracting sub-sentence fragments from sentence pairs that would otherwise have been discarded, in order to increase recall when compiling training data. We find that by including the fragments, translation quality of NMT systems trained on the data improves significantly when translating from English to Bengali and from English to Icelandic."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="steingrimsson-etal-2023-discard">
<titleInfo>
<title>Do Not Discard – Extracting Useful Fragments from Low-Quality Parallel Data to Improve Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Steinþór</namePart>
<namePart type="family">Steingrímsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pintu</namePart>
<namePart type="family">Lohar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andy</namePart>
<namePart type="family">Way</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Corpus Generation and Corpus Augmentation for Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>Asia-Pacific Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Macau SAR, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>When parallel corpora are preprocessed for machine translation (MT) training, a part of the parallel data is commonly discarded and deemed non-parallel due to odd-length ratio, overlapping text in source and target sentences or failing some other form of a semantic equivalency test. For language pairs with limited parallel resources, this can be costly as in such cases modest amounts of acceptable data may be useful to help build MT systems that generate higher quality translations. In this paper, we refine parallel corpora for two language pairs, English–Bengali and English–Icelandic, by extracting sub-sentence fragments from sentence pairs that would otherwise have been discarded, in order to increase recall when compiling training data. We find that by including the fragments, translation quality of NMT systems trained on the data improves significantly when translating from English to Bengali and from English to Icelandic.</abstract>
<identifier type="citekey">steingrimsson-etal-2023-discard</identifier>
<location>
<url>https://aclanthology.org/2023.mtsummit-coco4mt.1/</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>1</start>
<end>13</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do Not Discard – Extracting Useful Fragments from Low-Quality Parallel Data to Improve Machine Translation
%A Steingrímsson, Steinþór
%A Lohar, Pintu
%A Loftsson, Hrafn
%A Way, Andy
%S Proceedings of the Second Workshop on Corpus Generation and Corpus Augmentation for Machine Translation
%D 2023
%8 September
%I Asia-Pacific Association for Machine Translation
%C Macau SAR, China
%F steingrimsson-etal-2023-discard
%X When parallel corpora are preprocessed for machine translation (MT) training, a part of the parallel data is commonly discarded and deemed non-parallel due to odd-length ratio, overlapping text in source and target sentences or failing some other form of a semantic equivalency test. For language pairs with limited parallel resources, this can be costly as in such cases modest amounts of acceptable data may be useful to help build MT systems that generate higher quality translations. In this paper, we refine parallel corpora for two language pairs, English–Bengali and English–Icelandic, by extracting sub-sentence fragments from sentence pairs that would otherwise have been discarded, in order to increase recall when compiling training data. We find that by including the fragments, translation quality of NMT systems trained on the data improves significantly when translating from English to Bengali and from English to Icelandic.
%U https://aclanthology.org/2023.mtsummit-coco4mt.1/
%P 1-13
Markdown (Informal)
[Do Not Discard – Extracting Useful Fragments from Low-Quality Parallel Data to Improve Machine Translation](https://aclanthology.org/2023.mtsummit-coco4mt.1/) (Steingrímsson et al., MTSummit 2023)
ACL