@inproceedings{eck-etal-2014-extracting,
title = "Extracting translation pairs from social network content",
author = "Eck, Matthias and
Zemlyanskiy, Yuri and
Zhang, Joy and
Waibel, Alex",
editor = {Federico, Marcello and
St{\"u}ker, Sebastian and
Yvon, Fran{\c{c}}ois},
booktitle = "Proceedings of the 11th International Workshop on Spoken Language Translation: Papers",
month = dec # " 4-5",
year = "2014",
address = "Lake Tahoe, California",
url = "https://aclanthology.org/2014.iwslt-papers.7",
pages = "200--205",
abstract = "We introduce two methods to collect additional training data for statistical machine translation systems from public social network content. The first method identifies multilingual content where the author self-translated their own post to reach additional friends, fans or customers. Once identified, we can split the post into the language segments and extract translation pairs from this content. The second method considers web links (URLs) that users add as part of their post to point the reader to a video, article or website. If the same URL is shared from different language users, there is a chance they might give the same comment in their respective language. We use a support vector machine (SVM) as a classifier to identify true translations from all candidate pairs. We collected additional translation pairs using both methods for the language pairs Spanish-English and Portuguese-English. Testing the collected data as additional training data for statistical machine translation on in-domain test sets resulted in very significant improvements of up to 5 BLEU.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="eck-etal-2014-extracting">
<titleInfo>
<title>Extracting translation pairs from social network content</title>
</titleInfo>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Eck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Zemlyanskiy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joy</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Waibel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2014-12-04/2014-12-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 11th International Workshop on Spoken Language Translation: Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marcello</namePart>
<namePart type="family">Federico</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Stüker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">François</namePart>
<namePart type="family">Yvon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<place>
<placeTerm type="text">Lake Tahoe, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce two methods to collect additional training data for statistical machine translation systems from public social network content. The first method identifies multilingual content where the author self-translated their own post to reach additional friends, fans or customers. Once identified, we can split the post into the language segments and extract translation pairs from this content. The second method considers web links (URLs) that users add as part of their post to point the reader to a video, article or website. If the same URL is shared from different language users, there is a chance they might give the same comment in their respective language. We use a support vector machine (SVM) as a classifier to identify true translations from all candidate pairs. We collected additional translation pairs using both methods for the language pairs Spanish-English and Portuguese-English. Testing the collected data as additional training data for statistical machine translation on in-domain test sets resulted in very significant improvements of up to 5 BLEU.</abstract>
<identifier type="citekey">eck-etal-2014-extracting</identifier>
<location>
<url>https://aclanthology.org/2014.iwslt-papers.7</url>
</location>
<part>
<date>2014-12-04/2014-12-05</date>
<extent unit="page">
<start>200</start>
<end>205</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Extracting translation pairs from social network content
%A Eck, Matthias
%A Zemlyanskiy, Yuri
%A Zhang, Joy
%A Waibel, Alex
%Y Federico, Marcello
%Y Stüker, Sebastian
%Y Yvon, François
%S Proceedings of the 11th International Workshop on Spoken Language Translation: Papers
%D 2014
%8 December 4-5
%C Lake Tahoe, California
%F eck-etal-2014-extracting
%X We introduce two methods to collect additional training data for statistical machine translation systems from public social network content. The first method identifies multilingual content where the author self-translated their own post to reach additional friends, fans or customers. Once identified, we can split the post into the language segments and extract translation pairs from this content. The second method considers web links (URLs) that users add as part of their post to point the reader to a video, article or website. If the same URL is shared from different language users, there is a chance they might give the same comment in their respective language. We use a support vector machine (SVM) as a classifier to identify true translations from all candidate pairs. We collected additional translation pairs using both methods for the language pairs Spanish-English and Portuguese-English. Testing the collected data as additional training data for statistical machine translation on in-domain test sets resulted in very significant improvements of up to 5 BLEU.
%U https://aclanthology.org/2014.iwslt-papers.7
%P 200-205
Markdown (Informal)
[Extracting translation pairs from social network content](https://aclanthology.org/2014.iwslt-papers.7) (Eck et al., IWSLT 2014)
ACL