@inproceedings{mansour-etal-2011-combining,
title = "Combining translation and language model scoring for domain-specific data filtering",
author = "Mansour, Saab and
Wuebker, Joern and
Ney, Hermann",
editor = {Federico, Marcello and
Hwang, Mei-Yuh and
R{\"o}dder, Margit and
St{\"u}ker, Sebastian},
booktitle = "Proceedings of the 8th International Workshop on Spoken Language Translation: Papers",
month = dec # " 8-9",
year = "2011",
address = "San Francisco, California",
url = "https://aclanthology.org/2011.iwslt-papers.5",
pages = "222--229",
abstract = "The increasing popularity of statistical machine translation (SMT) systems is introducing new domains of translation that need to be tackled. As many resources are already available, domain adaptation methods can be applied to utilize these recourses in the most beneficial way for the new domain. We explore adaptation via filtering, using the crossentropy scores to discard irrelevant sentences. We focus on filtering for two important components of an SMT system, namely the language model (LM) and the translation model (TM). Previous work has already applied LM cross-entropy based scoring for filtering. We argue that LM cross-entropy might be appropriate for LM filtering, but not as much for TM filtering. We develop a novel filtering approach based on a combined TM and LM cross-entropy scores. We experiment with two large-scale translation tasks, the Arabic-to-English and English-to-French IWSLT 2011 TED Talks MT tasks. For LM filtering, we achieve strong perplexity improvements which carry over to the translation quality with improvements up to +0.4{\%} BLEU. For TM filtering, the combined method achieves small but consistent improvements over the standalone methods. As a side effect of adaptation via filtering, the fully fledged SMT system vocabulary size and phrase table size are reduced by a factor of at least 2 while up to +0.6{\%} BLEU improvement is observed.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mansour-etal-2011-combining">
<titleInfo>
<title>Combining translation and language model scoring for domain-specific data filtering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saab</namePart>
<namePart type="family">Mansour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joern</namePart>
<namePart type="family">Wuebker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hermann</namePart>
<namePart type="family">Ney</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2011-dec 8-9</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th International Workshop on Spoken Language Translation: Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mei-Yuh</namePart>
<namePart type="family">Hwang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Margit</namePart>
<namePart type="family">Rödder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Stüker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<place>
<placeTerm type="text">San Francisco, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The increasing popularity of statistical machine translation (SMT) systems is introducing new domains of translation that need to be tackled. As many resources are already available, domain adaptation methods can be applied to utilize these recourses in the most beneficial way for the new domain. We explore adaptation via filtering, using the crossentropy scores to discard irrelevant sentences. We focus on filtering for two important components of an SMT system, namely the language model (LM) and the translation model (TM). Previous work has already applied LM cross-entropy based scoring for filtering. We argue that LM cross-entropy might be appropriate for LM filtering, but not as much for TM filtering. We develop a novel filtering approach based on a combined TM and LM cross-entropy scores. We experiment with two large-scale translation tasks, the Arabic-to-English and English-to-French IWSLT 2011 TED Talks MT tasks. For LM filtering, we achieve strong perplexity improvements which carry over to the translation quality with improvements up to +0.4% BLEU. For TM filtering, the combined method achieves small but consistent improvements over the standalone methods. As a side effect of adaptation via filtering, the fully fledged SMT system vocabulary size and phrase table size are reduced by a factor of at least 2 while up to +0.6% BLEU improvement is observed.</abstract>
<identifier type="citekey">mansour-etal-2011-combining</identifier>
<location>
<url>https://aclanthology.org/2011.iwslt-papers.5</url>
</location>
<part>
<date>2011-dec 8-9</date>
<extent unit="page">
<start>222</start>
<end>229</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Combining translation and language model scoring for domain-specific data filtering
%A Mansour, Saab
%A Wuebker, Joern
%A Ney, Hermann
%Y Federico, Marcello
%Y Hwang, Mei-Yuh
%Y Rödder, Margit
%Y Stüker, Sebastian
%S Proceedings of the 8th International Workshop on Spoken Language Translation: Papers
%D 2011
%8 dec 8 9
%C San Francisco, California
%F mansour-etal-2011-combining
%X The increasing popularity of statistical machine translation (SMT) systems is introducing new domains of translation that need to be tackled. As many resources are already available, domain adaptation methods can be applied to utilize these recourses in the most beneficial way for the new domain. We explore adaptation via filtering, using the crossentropy scores to discard irrelevant sentences. We focus on filtering for two important components of an SMT system, namely the language model (LM) and the translation model (TM). Previous work has already applied LM cross-entropy based scoring for filtering. We argue that LM cross-entropy might be appropriate for LM filtering, but not as much for TM filtering. We develop a novel filtering approach based on a combined TM and LM cross-entropy scores. We experiment with two large-scale translation tasks, the Arabic-to-English and English-to-French IWSLT 2011 TED Talks MT tasks. For LM filtering, we achieve strong perplexity improvements which carry over to the translation quality with improvements up to +0.4% BLEU. For TM filtering, the combined method achieves small but consistent improvements over the standalone methods. As a side effect of adaptation via filtering, the fully fledged SMT system vocabulary size and phrase table size are reduced by a factor of at least 2 while up to +0.6% BLEU improvement is observed.
%U https://aclanthology.org/2011.iwslt-papers.5
%P 222-229
Markdown (Informal)
[Combining translation and language model scoring for domain-specific data filtering](https://aclanthology.org/2011.iwslt-papers.5) (Mansour et al., IWSLT 2011)
ACL