@inproceedings{shah-etal-2012-general,
title = "A General Framework to Weight Heterogeneous Parallel Data for Model Adaptation in Statistical {MT}",
author = {Shah, Kashif and
Barrault, Lo{\"i}c and
Schwenk, Holger},
booktitle = "Proceedings of the 10th Conference of the Association for Machine Translation in the Americas: Research Papers",
month = oct # " 28-" # nov # " 1",
year = "2012",
address = "San Diego, California, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2012.amta-papers.21/",
abstract = "The standard procedure to train the translation model of a phrase-based SMT system is to concatenate all available parallel data, to perform word alignment, to extract phrase pairs and to calculate translation probabilities by simple relative frequency. However, parallel data is quite inhomogeneous in many practical applications with respect to several factors like data source, alignment quality, appropriateness to the task, etc. We propose a general framework to take into account these factors during the calculation of the phrase-table, e.g. by better distributing the probability mass of the individual phrase pairs. No additional feature functions are needed. We report results on two well-known tasks: the IWSLT`11 and WMT`11 evaluations, in both conditions translating from English to French. We give detailed results for different functions to weight the bitexts. Our best systems improve a strong baseline by up to one BLEU point without any impact on the computational complexity during training or decoding."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shah-etal-2012-general">
<titleInfo>
<title>A General Framework to Weight Heterogeneous Parallel Data for Model Adaptation in Statistical MT</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kashif</namePart>
<namePart type="family">Shah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Loïc</namePart>
<namePart type="family">Barrault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Holger</namePart>
<namePart type="family">Schwenk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-oct 28-nov 1</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Conference of the Association for Machine Translation in the Americas: Research Papers</title>
</titleInfo>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The standard procedure to train the translation model of a phrase-based SMT system is to concatenate all available parallel data, to perform word alignment, to extract phrase pairs and to calculate translation probabilities by simple relative frequency. However, parallel data is quite inhomogeneous in many practical applications with respect to several factors like data source, alignment quality, appropriateness to the task, etc. We propose a general framework to take into account these factors during the calculation of the phrase-table, e.g. by better distributing the probability mass of the individual phrase pairs. No additional feature functions are needed. We report results on two well-known tasks: the IWSLT‘11 and WMT‘11 evaluations, in both conditions translating from English to French. We give detailed results for different functions to weight the bitexts. Our best systems improve a strong baseline by up to one BLEU point without any impact on the computational complexity during training or decoding.</abstract>
<identifier type="citekey">shah-etal-2012-general</identifier>
<location>
<url>https://aclanthology.org/2012.amta-papers.21/</url>
</location>
<part>
<date>2012-oct 28-nov 1</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A General Framework to Weight Heterogeneous Parallel Data for Model Adaptation in Statistical MT
%A Shah, Kashif
%A Barrault, Loïc
%A Schwenk, Holger
%S Proceedings of the 10th Conference of the Association for Machine Translation in the Americas: Research Papers
%D 2012
%8 oct 28 nov 1
%I Association for Machine Translation in the Americas
%C San Diego, California, USA
%F shah-etal-2012-general
%X The standard procedure to train the translation model of a phrase-based SMT system is to concatenate all available parallel data, to perform word alignment, to extract phrase pairs and to calculate translation probabilities by simple relative frequency. However, parallel data is quite inhomogeneous in many practical applications with respect to several factors like data source, alignment quality, appropriateness to the task, etc. We propose a general framework to take into account these factors during the calculation of the phrase-table, e.g. by better distributing the probability mass of the individual phrase pairs. No additional feature functions are needed. We report results on two well-known tasks: the IWSLT‘11 and WMT‘11 evaluations, in both conditions translating from English to French. We give detailed results for different functions to weight the bitexts. Our best systems improve a strong baseline by up to one BLEU point without any impact on the computational complexity during training or decoding.
%U https://aclanthology.org/2012.amta-papers.21/
Markdown (Informal)
[A General Framework to Weight Heterogeneous Parallel Data for Model Adaptation in Statistical MT](https://aclanthology.org/2012.amta-papers.21/) (Shah et al., AMTA 2012)
ACL