@inproceedings{rubino-etal-2014-quality,
title = "Quality Estimation for Synthetic Parallel Data Generation",
author = "Rubino, Raphael and
Toral, Antonio and
Ljube{\v{s}}i{\'c}, Nikola and
Ram{\'i}rez-S{\'a}nchez, Gema",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Loftsson, Hrafn and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}`14)",
month = may,
year = "2014",
address = "Reykjavik, Iceland",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L14-1625/",
pages = "1843--1849",
abstract = "This paper presents a novel approach for parallel data generation using machine translation and quality estimation. Our study focuses on pivot-based machine translation from English to Croatian through Slovene. We generate an English{\textemdash}Croatian version of the Europarl parallel corpus based on the English{\textemdash}Slovene Europarl corpus and the Apertium rule-based translation system for Slovene{\textemdash}Croatian. These experiments are to be considered as a first step towards the generation of reliable synthetic parallel data for under-resourced languages. We first collect small amounts of aligned parallel data for the Slovene{\textemdash}Croatian language pair in order to build a quality estimation system for sentence-level Translation Edit Rate (TER) estimation. We then infer TER scores on automatically translated Slovene to Croatian sentences and use the best translations to build an English{\textemdash}Croatian statistical MT system. We show significant improvement in terms of automatic metrics obtained on two test sets using our approach compared to a random selection of synthetic parallel data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rubino-etal-2014-quality">
<titleInfo>
<title>Quality Estimation for Synthetic Parallel Data Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Raphael</namePart>
<namePart type="family">Rubino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gema</namePart>
<namePart type="family">Ramírez-Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC‘14)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents a novel approach for parallel data generation using machine translation and quality estimation. Our study focuses on pivot-based machine translation from English to Croatian through Slovene. We generate an English—Croatian version of the Europarl parallel corpus based on the English—Slovene Europarl corpus and the Apertium rule-based translation system for Slovene—Croatian. These experiments are to be considered as a first step towards the generation of reliable synthetic parallel data for under-resourced languages. We first collect small amounts of aligned parallel data for the Slovene—Croatian language pair in order to build a quality estimation system for sentence-level Translation Edit Rate (TER) estimation. We then infer TER scores on automatically translated Slovene to Croatian sentences and use the best translations to build an English—Croatian statistical MT system. We show significant improvement in terms of automatic metrics obtained on two test sets using our approach compared to a random selection of synthetic parallel data.</abstract>
<identifier type="citekey">rubino-etal-2014-quality</identifier>
<location>
<url>https://aclanthology.org/L14-1625/</url>
</location>
<part>
<date>2014-05</date>
<extent unit="page">
<start>1843</start>
<end>1849</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Quality Estimation for Synthetic Parallel Data Generation
%A Rubino, Raphael
%A Toral, Antonio
%A Ljubešić, Nikola
%A Ramírez-Sánchez, Gema
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Loftsson, Hrafn
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC‘14)
%D 2014
%8 May
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F rubino-etal-2014-quality
%X This paper presents a novel approach for parallel data generation using machine translation and quality estimation. Our study focuses on pivot-based machine translation from English to Croatian through Slovene. We generate an English—Croatian version of the Europarl parallel corpus based on the English—Slovene Europarl corpus and the Apertium rule-based translation system for Slovene—Croatian. These experiments are to be considered as a first step towards the generation of reliable synthetic parallel data for under-resourced languages. We first collect small amounts of aligned parallel data for the Slovene—Croatian language pair in order to build a quality estimation system for sentence-level Translation Edit Rate (TER) estimation. We then infer TER scores on automatically translated Slovene to Croatian sentences and use the best translations to build an English—Croatian statistical MT system. We show significant improvement in terms of automatic metrics obtained on two test sets using our approach compared to a random selection of synthetic parallel data.
%U https://aclanthology.org/L14-1625/
%P 1843-1849
Markdown (Informal)
[Quality Estimation for Synthetic Parallel Data Generation](https://aclanthology.org/L14-1625/) (Rubino et al., LREC 2014)
ACL
- Raphael Rubino, Antonio Toral, Nikola Ljubešić, and Gema Ramírez-Sánchez. 2014. Quality Estimation for Synthetic Parallel Data Generation. In Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14), pages 1843–1849, Reykjavik, Iceland. European Language Resources Association (ELRA).