<?xml version="1.0" encoding="UTF-8"?>
<algorithms version="110505">
<algorithm name="SectLabel" version="110505">
<variant no="0" confidence="0.000015">
<title confidence="0.9986615">
Five Shades of Noise:
Analyzing Machine Translation Errors in User-Generated Text
</title>
<author confidence="0.927547">
Marlies van der Wees Arianna Bisazza Christof Monz
</author>
<affiliation confidence="0.97392">
Informatics Institute, University of Amsterdam
</affiliation>
<email confidence="0.995951">
{m.e.vanderwees,a.bisazza,c.monz}@uva.nl
</email>
<sectionHeader confidence="0.99382" genericHeader="abstract">
Abstract
</sectionHeader>
<bodyText confidence="0.999959944444445">
It is widely accepted that translating user-
generated (UG) text is a difficult task
for modern statistical machine translation
(SMT) systems. The translation quality
metrics typically used in the SMT litera-
ture reflect the overall quality of the sys-
tem output but provide little insight into
what exactly makes UG text translation
difficult. This paper analyzes in detail
the behavior of a state-of-the-art SMT sys-
tem on five different types of informal
text. The results help to demystify the
poor SMT performance experienced by re-
searchers who use SMT as an intermedi-
ate step of their UG-NLP pipeline, and to
identify translation modeling aspects that
the SMT community should more urgently
address to improve translation of UG data.
</bodyText>
<sectionHeader confidence="0.998991" genericHeader="introduction">
1 Introduction
</sectionHeader>
<bodyText confidence="0.983235578947368">
User-generated (UG) text such as found on social
media and web forums poses different challenges
to statistical machine translation (SMT) than for-
mal text. This is reflected by poor translation qual-
ity for informal genres (see for example Figure 1),
which is typically measured with automatic qual-
ity metrics such as BLEU (Papineni et al., 2002),
METEOR (Banerjee and Lavie, 2005), or TER
(Snover et al., 2006). These scores alone, however,
only reflect the overall translation quality, and do
not provide any insight into what exactly makes
translating UG text hard. While such knowledge
is crucial for improving SMT of UG text, surpris-
ingly little work on error analysis for SMT of user-
generated text has been reported.
Moreover, the notion of user-generated content
ﺯﺗﻣﻝﺎﻳﻌﻟﺍﻥﺎﺷﻋﺕﻟﺎﻗ
she said so the kids do not feel upset
she said because of the sons
</bodyText>
<figure confidence="0.535249666666667">
In (Chinese): 你 路上 慢 点
Reference: take your time
MT output: you are on the road to slow points
</figure>
<figureCaption confidence="0.999334">
Figure 1: SMS examples with poor SMT output.
</figureCaption>
<bodyText confidence="0.999932178571429">
only partially specifies the exact nature of docu-
ments. What all documents that can be classified
as being UG have in common is the fact that they
have been written by a lay-person, as opposed to
a journalist or professional author, and that they
have not undergone any editorial control. UG
text also tends to express the writer’s opinion to
a larger degree than news articles which generally
strive for balance and nuance. Within UG text, we
can distinguish several subclasses, including (i)
message and dialog-oriented content such as short
message service (SMS) texts, Internet chat mes-
sages, and transcripts of conversational speech, (ii)
commentaries to news articles, often expressing an
opinion about the corresponding articles and relat-
ing the content to the reader’s situation, and (iii)
weblogs, which can bear some resemblance to ed-
itorial pieces published by news organizations.
While UG text processing tasks are becoming
more and more common, the research in SMT is
still mostly driven by formal translation tasks1,
and existing error analysis approaches are only
partially useful for UG. In this work, we conduct a
series of analyses on five different UG benchmark
sets for two language pairs, Arabic-English and
Chinese-English, with the goals of (i) explaining
the typically poor SMT performance observed for
UG texts, and (ii) identifying translation modeling
</bodyText>
<footnote confidence="0.864708">
1One of the very few exceptions is NIST OpenMT 2015,
which focusses entirely on translating informal genres.
</footnote>
<figure confidence="0.429043">
In (Arabic):
Reference:
MT output:
ﺵﻠﻋ
</figure>
<page confidence="0.987331">
28
</page>
<note confidence="0.9884205">
Proceedings of the ACL 2015 Workshop on Noisy User-generated Text, pages 28–37,
Beijing, China, July 31, 2015. ©2015 Association for Computational Linguistics
</note>
<bodyText confidence="0.999884">
aspects that should be addressed to improve trans-
lation of UG data. We not only contrast our obser-
vations with two news data sets, but we also show
that SMT quality can vary significantly across dif-
ferent types of UG content, and that different UG
types exhibit dissimilar error distributions. Specif-
ically, we summarize our main findings as follows:
</bodyText>
<listItem confidence="0.994565111111111">
• The SMS and chat benchmarks are the most
distant from formal text at all the analyzed
levels. Errors in other types of UG are often
more similar to news errors than to those in
SMS and chat messages.
• SMT model coverage dramatically deterio-
rates for phrases of length 3 or longer in most
of the UG benchmarks.
• Errors due to out-of-vocabulary (OOV)
</listItem>
<bodyText confidence="0.804721">
words in the source text substantially in-
crease in number for UG data sets, but are
considerably less common than errors due to
source-target OOVs, i.e., phrase pairs that are
not covered by the SMT models.
</bodyText>
<sectionHeader confidence="0.999668" genericHeader="related work">
2 Related Work
</sectionHeader>
<bodyText confidence="0.99955775">
Identifying and analyzing different types of SMT
errors is an essential step towards the development
of translation approaches that can achieve more ro-
bust performance, and has been the focus of earlier
work. Popović and Ney (2011), for example, com-
bine word error rates with morpho-syntactic infor-
mation to classify errors into five categories; in-
flectional errors, reordering errors, lexical errors,
word deletions, and word insertions. Irvine et al.
(2013) use word alignment links to quantify in-
correct lexical choices, and determine how such
errors change when shifting domains. Other work
</bodyText>
<table confidence="0.983344111111111">
Genre Dev set Test set Refs
Lines Tokens Lines Tokens
SMS 2.7K 23.3K 7.6K 44.9K 1
Chat 3.5K 22.5K 7.1K 44.5K 1
CTS 2.4K 23.1K 3.6K 40.6K 1
Comments 1.1K 25.8K 1.7K 45.5K 1
Weblogs 0.8K 14.6K 1.3K 39.9K 4
News 1 1.0K 26.9K 1.6K 46.3K 1
News 2 1.0K 34.4K 1.4K 46.6K 4
</table>
<tableCaption confidence="0.998543">
Table 1: Statistics of the Arabic-English UG (top)
</tableCaption>
<bodyText confidence="0.972900866666667">
and contrastive news (bottom) evaluation sets. To-
kens are counted on the Arabic side.
on SMT error analysis studies the effect of domain
adaptation on SMT, for example by examining in
which stage of the SMT pipeline the available in-
domain data can best be used (Duh et al., 2010),
or whether it is more promising to improve either
phrase extraction or scoring (Bisazza et al., 2011;
Haddow and Koehn, 2012).
The vast majority of SMT research, includ-
ing the above described work on error analysis,
is evaluated on data containing formal language.
Work on SMT of informal text mostly targets re-
duction of OOV words in the source text, for ex-
ample by correcting spelling errors (Bertoldi et al.,
2010), normalizing noisy text to more formal text
(Banerjee et al., 2012; Ling et al., 2013a), or en-
hancing the training data with bilingual segments
extracted from Twitter (Jehl et al., 2012; Ling et
al., 2013b). Other work improves SMT of UG
text by combining statistical and rule-based MT
(Carrera et al., 2009), or models trained on for-
mal and informal data (Banerjee et al., 2011). Fi-
nally, Roturier and Bensadoun (2011) conduct a
comparative study to determine the ability of sev-
eral SMT systems to translate UG text, but they do
not examine what errors the systems make. To our
knowledge, our work is the first that looks inside
an SMT system to systematically inspect its be-
havior across a diverse spectrum of UG text types.
</bodyText>
<sectionHeader confidence="0.996914" genericHeader="method">
3 Experimental setup
</sectionHeader>
<bodyText confidence="0.9994325">
We perform our error analysis on two language
pairs, Arabic-English and Chinese-English.
</bodyText>
<subsectionHeader confidence="0.999209">
3.1 Evaluation sets
</subsectionHeader>
<bodyText confidence="0.98950375">
For both language pairs we use evaluation sets
for five types of user-generated text: SMS mes-
sages, chat messages, manual transcripts of phone
conversations (called Conversational Telephone
</bodyText>
<table confidence="0.992455666666667">
Genre Dev set Test set Refs
Lines Tokens Lines Tokens
SMS 1.8K 15.3K 4.2K 36.3K 1
Chat 4.0K 25.6K 6.0K 45.7K 1
CTS 2.2K 25.1K 2.9K 44.8K 1
Comments 1.0K 26.5K 1.5K 41.0K 1
Weblogs 0.5K 8.8K 0.7K 14.4K 4
News 1 0.8K 24.5K 1.5K 41.9K 1
News 2 1.2K 29.4K 0.7K 17.7K 4
</table>
<tableCaption confidence="0.972562333333333">
Table 2: Statistics of the Chinese-English UG
(top) and contrastive news (bottom) evaluation
sets. Tokens are counted on the Chinese side.
</tableCaption>
<page confidence="0.993112">
29
</page>
<figure confidence="0.998943708333333">
BLEU (1 reference)
BLEU (1 reference)
30
Translation performance of Arabic-English benchmarks
35
30
25
20
15
10
5
0 News 1 News 2 Weblogs Comments CTS Chat SMS
Translation performance of Chinese-English benchmarks
35
Online
In-house
25
20
15
10
5
0 News 1 News 2 Weblogs Comments CTS Chat SMS
Online
In-house
</figure>
<figureCaption confidence="0.9858225">
Figure 2: Translation performance of baseline experiments for various Arabic-English (left) and Chinese-
English (right) data sets, measured in case-insensitive BLEU for one reference translation.
</figureCaption>
<bodyText confidence="0.999063904761905">
Speech (CTS)), weblogs, and readers’ comments
to news articles. The first four data sets orig-
inate from BOLT and NIST OpenMT, and are
distributed by the Linguistic Data Consortium
(LDC), while the last data set is crawled from the
web. All UG experiments are contrasted with two
news data sets; the news portions of NIST evalua-
tion sets, and web-crawled news articles.
For Arabic-English, the web-crawled news arti-
cles and comments originate from the Gen&amp;Topic
data set (van der Wees et al., 2015), in which both
genres cover the same distributions over various
topics. Consequently, any observed differences
between the news and UG portions of this data set
can be entirely attributed to genre differences and
not to potential topical variation.
We have created similar-sized benchmark sets
as much as possible; however, we were sometimes limited
by availability. Tables 1 and 2 show the data
specifications of the Arabic-English and Chinese-
English evaluation sets, respectively.2
</bodyText>
<subsectionHeader confidence="0.99976">
3.2 SMT systems
</subsectionHeader>
<bodyText confidence="0.989908235294118">
All experiments presented in this paper are per-
formed with our in-house state-of-the-art system
based on phrase-based SMT and similar to Moses
(Koehn et al., 2007). Our Arabic-English system
is built from 1.75M lines (52.9M source tokens)
of parallel text, and our Chinese-English system
from 3.13M lines (55.4M source tokens) of paral-
lel text. We tokenize all Arabic data using MADA
(Habash and Rambow, 2005), ATB scheme, and
we segment the Chinese data following Tseng et
al. (2005). Both systems use an adapted 5-gram
English language model that linearly interpolates
different English Gigaword subcorpora with the
2Note that two evaluation sets contain four reference
translations instead of one. To allow for fair comparison, we
average the scores of the four references in all our analyses.
English side of our bitexts, containing both news
and UG data.
While parallel data is scarce in general, the sit-
uation is much worse for UG data, where there
are hardly any sizable parallel corpora for any
language pair. As a consequence, the training
data of both systems comprises 70-75% news data,
mostly LDC-distributed, and 25-30% data in var-
ious other genres (weblogs, comments, editori-
als, speech transcripts, and small amounts of chat
data), mostly harvested from the web. Per lan-
guage pair, all experiments use the same SMT
models, but we tune parameters separately for
each benchmark set using pairwise ranking opti-
mization (PRO) (Hopkins and May, 2011).
To put the results of our system into perspective,
we also run a first series of experiments on a well-
known and established online SMT system.
</bodyText>
<sectionHeader confidence="0.943613" genericHeader="method">
4 Error analysis and results
</sectionHeader>
<bodyText confidence="0.987708666666667">
We perform four series of experiments, each with
the goal of answering different questions about
SMT for UG text:
</bodyText>
<listItem confidence="0.988613666666667">
1. How large is the gap in translation quality be-
tween news and different types of UG data?
(§4.1). To answer this question, we measure
the BLEU score of two state-of-the-art SMT
system outputs on all our data sets.
2. What kind of translation choices does the
SMT system make for UG data? To answer
this question, we measure phrase lengths
used during the translation (or decoding) pro-
cess (§4.2).
3. What translation choices could have been
made by the SMT system? To answer this
</listItem>
<bodyText confidence="0.6157485">
question, we compute mono- and bilingual
coverage of the SMT models (§4.3).
</bodyText>
<page confidence="0.97096">
30
</page>
<figure confidence="0.99931155">
Average phrase length (#tokens)
2.0
1.5
1.0
0.5
0.0
Average phrase length (#tokens)
2.0
1.5
1.0
0.5
0.0
Decoding phrase lengths for Arabic-English benchmarks
News 1 News 2 Weblogs Comments CTS Chat SMS
Decoding phrase lengths for Chinese-English benchmarks
Source
Target
News 1 News 2 Weblogs Comments CTS Chat SMS
Source
Target
</figure>
<figureCaption confidence="0.999896">
Figure 3: Average source-side and target-side phrase lengths used during decoding.
</figureCaption>
<bodyText confidence="0.998766428571429">
4. Why did the SMT system make the transla-
tion choices that it made? What errors are ob-
served for each benchmark, and how often?
To answer these questions, we reimplement
the word-alignment driven error analysis ap-
proach by Irvine et al. (2013) and perform a
qualitative analysis on the results (§4.4).
</bodyText>
<subsectionHeader confidence="0.982634">
4.1 Overall translation quality
</subsectionHeader>
<bodyText confidence="0.999980290322581">
A first important indication of SMT quality across
different genres can be given by translation quality
measures that are based on the similarity between
the SMT output and a reference human translation.
To estimate the gap in translation quality between
news and UG text, but also among various types
of UG text, we measure the BLEU scores (1 refer-
ence) of our in-house SMT system and that of the
online system on all our evaluation sets.
The results in Figure 2 (left) show that trans-
lation quality differs greatly between the Arabic-
English data sets. In particular, the News 1 data
set (from NIST) yields considerably higher BLEU
scores than all other evaluation sets, including the
News 2 (web-crawled) set, which represents the
same genre but is visibly more difficult to trans-
late. On the other end of the spectrum, we see
that translation quality of the SMS and chat data
sets is very poor. Note that our in-house system is
optimized per genre, whereas the online system is
optimized for general language and speed.
For Chinese-English (Figure 2, right) the differ-
ences in BLEU are less pronounced, both across
the different data sets and between the two SMT
systems. Still, translation quality is worse for the
UG data sets than for news, indicating that also
for this language pair translating UG text is more
challenging than translating news.
As all subsequent analyses require system-
internal information, we carry out the experiments
with our in-house system only.
</bodyText>
<subsectionHeader confidence="0.996352">
4.2 Translation phrase length analysis
</subsectionHeader>
<bodyText confidence="0.999953814814815">
Most state-of-the-art SMT systems, including our
in-house system, are phrase-based, with transla-
tions being generated phrase by phrase rather than
word by word (Koehn et al., 2003). An abundant
use of small phrases during decoding indicates that
the system is not taking advantage of the model’s
ability to memorize large contextual and possi-
bly non-compositional translation blocks. It is
therefore interesting to measure the average phrase
length (i.e., number of tokens) used by the system,
for the source as well as the target language (Fig-
ure 3). For Arabic-English we see that source-side
phrases are noticeably longer for both news bench-
marks than for the UG data sets. The average
target-side phrase length, on the other hand, shows
less correlation with the genres of the data sets.
Similar trends are observed for Chinese-English,
however differences are less extreme.
In general, SMT systems incur higher model
costs when utilizing many small phrases rather
than few large phrases. If, in spite of that, a sys-
tem selects many short phrases, which is the case
for most of our UG benchmarks, this can be due
to (i) unreliable translation probabilities or (ii)
the mere lack of correct translation options in the
models. We investigate both issues in the follow-
ing analyses.
</bodyText>
<subsectionHeader confidence="0.999037">
4.3 Model coverage analysis
</subsectionHeader>
<bodyText confidence="0.999916333333333">
Next, we examine the translation model coverage
for each data set, which tells us what phrases the
system could have used for decoding. For each of
our test sets, we create automatic word alignments
using GIZA++ (Och and Ney, 2003), and extract
from these the set of all reference phrase pairs us-
ing Moses’ phrase extraction algorithm (Koehn et
al., 2007). By comparing this set of phrase pairs
to the available phrases in the SMT models, which
</bodyText>
<page confidence="0.999879">
31
</page>
<tableCaption confidence="0.866768333333333">
Table 3: Target language model perplexity and translation model coverage of Arabic-English bench-
marks. Phrase pair recall values are broken down by source phrase length. Intensities of the cell colors
indicate relative recall values with respect to the best scoring benchmark (measured in BLEU).
</tableCaption>
<table confidence="0.970895421052631">
Source phrase recall
Genre BLEU LM PP 1 2 3 4
News 1 33.8 65 99.7 88.9 56.3 26.1
News 2 21.5 86 99.6 88.1 53.7 21.8
Weblogs 22.3 152 99.2 80.5 40.6 13.5
Comments 17.2 117 97.7 80.2 43.0 15.3
CTS 16.0 103 97.4 66.3 25.1 6.4
Chat 10.0 179 94.1 56.0 19.4 4.7
SMS 8.8 196 93.7 57.8 17.5 3.3
Target phrase recall Phrase pair recall
3 4 1 2 3 4
61.5 29.6 84.9 54.4 23.6 8.1
53.4 23.6 77.4 46.9 18.8 5.9
48.9 17.8 78.4 41.5 12.9 2.9
55.3 21.9 59.1 33.2 11.1 2.8
54.3 21.5 66.7 25.7 6.1 1.0
47.3 16.7 60.8 21.3 4.5 0.8
47.0 21.1
14.6 62.0 3.7 0.4
</table>
<figure confidence="0.96132825">
1 2
99.7 91.1
88.1
99.5
99.7
99.8 90.8
98.6
99.1
86.3
89.8
86.1
86.3
99.5
99.4
83.8
44.2
14.7
63.1
32.4
10.7 3.3
</figure>
<table confidence="0.997203941176471">
Source phrase recall
Genre BLEU LM PP 1 2 3 4
News 1 17.2 121 99.0 80.2 40.8 16.2
News 2 15.4 118 98.8 84.2 44.3 16.0
Weblogs 11.8 153 98.6 76.6 33.8 11.1
Comments 11.1 195 98.7 78.3 35.2 8.7
CTS 12.5 135 98.7 80.7 40.1 10.5
Chat 9.9 221 98.0 71.9 27.5 6.1
SMS 10.7 234 97.3 68.5 24.9 4.8
Target phrase recall Phrase pair recall
1 2 3 4 1 2 3 4
99.5 84.9 48.0 19.5 69.1 34.8 10.8 3.3
99.3 81.6 40.8 12.4 59.0 27.0 7.3 1.7
97.9 77.9 35.1 10.2 53.5 21.6 5.0 1.0
99.8 86.3 47.4 16.4 70.0 33.5 9.3 1.7
99.4 82.6 43.2 13.0 62.3 24.8 5.4 0.6
99.0 80.4 40.5 12.5 62.6 24.6 5.1 0.5
</table>
<tableCaption confidence="0.8993355">
Table 4: Target language model perplexity and translation model coverage of Chinese-English bench-
marks. See Table 3 for explanation on colors and categories.
</tableCaption>
<bodyText confidence="0.975447">
have been extracted using the same procedure, we
can compute the following statistics:
</bodyText>
<listItem confidence="0.993990333333333">
1. Source phrase recall, defined as the fraction
of reference phrase pairs whose source side
is found in the SMT models.
2. Target phrase recall, defined as the fraction
of reference phrase pairs whose target side is
found in the SMT models.
3. Phrase pair recall, defined as the fraction of
reference phrase pairs whose source and tar-
get side are jointly found in the SMT models.
</listItem>
<bodyText confidence="0.99991393939394">
Low recall values indicate that the models lack
phrases or phrase pairs that match the test data,
which can be addressed by adding additional rele-
vant training data or by generating new phrases. In
addition, we measure language model perplexity
as an indication of how predictable each bench-
mark is for the language model. Note that high
perplexity corresponds to lower coverage.
The model coverage results for Arabic-English
and Chinese-English are shown in Tables 3 and 4,
respectively. All recall scores are broken down by
phrase length, up to phrases of four tokens.3 We
use cell color intensity to represent relative recall
values with respect to the best scoring benchmark
according to BLEU, i.e., News 1. The results show
that source phrase recall is substantially lower for
the UG benchmarks than for news, particularly for
longer phrases. Regarding target phrase recall,
differences between various data sets and genres
are much smaller. This suggests that many of the
reference phrases could potentially be generated
by the system, even for the UG data. However,
to be able to output the available target phrases,
the system needs a match with the input source
phrases, which is exactly what is being measured
with phrase pair recall. Here, we see that for the
majority of single-word source phrases, the ex-
pected target phrase is accessible by the system.
For longer phrases, though, there is again a drastic
decline in recall, with almost no phrases of length
4 or longer having the expected target covered by
the models. Similar to source phrase recall, this
decline is notably bigger for UG than for news.
</bodyText>
<footnote confidence="0.965534">
3The source-target phrase pair recall (last four columns) is
split by source phrase length rather than target phrase length
since source phrases are the actual input to the SMT system.
</footnote>
<page confidence="0.999201">
32
</page>
<bodyText confidence="0.99997844">
Looking at the differences between the various
types of UG data, we see that the SMS and chat
benchmarks are most severely affected by over-
all poor model coverage. As for weblogs, the
target phrase recall is similar to SMS and chat,
whereas both source phrase and phrase pair recall
are much higher. For CTS and web comments,
there are notable differences between model cov-
erage for the two language pairs, despite similar
BLEU scores. While comments have better cover-
age in the Arabic-English models, CTS has higher
recall values for Chinese-English.
Finally, we see that language model perplexity
is on average lower for Arabic-English than for
the Chinese-English benchmarks. This is some-
what surprising given that perplexity is measured
on the English side, but it can partially explain the
low BLEU scores on, for example, the Chinese-
English News 1 benchmark. All news benchmarks
have relatively low perplexities, which is expected
since the language model covers more news than
UG data. Of the UG benchmarks, CTS has a
remarkably low perplexity value, suggesting that
for this genre the language model can potentially
compensate for low translation model coverage.
</bodyText>
<subsectionHeader confidence="0.580136">
4.4 WADE: Word Alignment Driven
Evaluation
</subsectionHeader>
<bodyText confidence="0.95188685">
Next, to gain a more fine-grained insight into why
our SMT system makes its translation choices, we
reimplement an evaluation approach proposed by
Irvine et al. (2013), which analyzes SMT error
types at the word alignment level. The analysis
exploits automatic word alignments between (i) a
given source sentence and its reference translation,
and (ii) the same source sentence and its automatic
translation. Each aligned source-reference word
pair is examined for whether the alignment link is
matched by the decoder. Formally, fi is a foreign
source phrase target phrase probability
Figure 4: Graphical overview of SEEN, SENSE and
SCORE errors in a toy phrase table.
word, ej is a reference word aligned to fi, ai,j is
the alignment link between fi and ej, and Hi is
the set of output words that are aligned to fi by
the decoder. If ej ∈ Hi, the alignment link ai,j is
marked as correct. Otherwise, ai,j is categorized
with one of the following error types:
</bodyText>
<listItem confidence="0.734645">
1. A SEEN error indicates an unseen source
</listItem>
<bodyText confidence="0.807397">
word, i.e., out-of-vocabulary (OOV) item.
This error is assigned to ai,j if fi does not
appear in the phrase table used for transla-
tion. This type of error inversely correlates
with length-1 source phrase recall (§4.3).
</bodyText>
<listItem confidence="0.985598">
2. A SENSE error indicates an unseen target
word. This error is assigned to ai,j if fi
does appear in the phrase table but never with
translation candidate ej.
3. A SCORE error indicates suboptimal scoring
of translation options. This error is assigned
</listItem>
<bodyText confidence="0.96345025">
to ai,j if fi exists in the phrase table with
translation candidate ej, but another transla-
tion candidate is preferred by the decoder.
Figure 4 shows a graphical representation of these
error types and their ‘location’ in the phrase table.
In addition to the listed error types, Irvine et al.
define SEARCH errors as errors due to pruning in
beam search, and refer to the complete set of errors
as the S4 taxonomy. For this analysis, however,
SEARCH errors are indistinguishable from SCORE
errors, and are therefore never assigned.
A final category that can be considered are free-
bies: OOVs that are copied over verbatim to the
output sentence and accidentally match the ref-
erence translation (e.g., urls, proper nouns, etc.).
For the language pairs that we study, they are very
rare; at most 0.35% for Arabic-English (in CTS)
and 0.63% for Chinese-English (in SMS). Manual
inspection reveals that nearly all freebies are En-
glish words in the foreign source text. Since they
are so rare, we omit freebies from our results.
As WADE errors are assigned at the fine-
grained level of individual words, this analysis al-
lows for (i) sentence-level visualization of errors,
and (ii) collecting aggregate statistics of each error
type for an entire evaluation set. By assembling
the latter for various benchmarks, we can quantify
global differences between genres or data sets. At
the same time, by examining (i) we can gain in-
sight into the nature of the different ‘errors’, which
might be real mistakes, or, for instance, different
lexical choices.
</bodyText>
<figure confidence="0.989202872727272">
ﻝ
ﺩﻣﺣﻟﺍ
Source phrase
not in phrase table:
ﻝ
SEEN error
ﻣﺣﻟﺍ
ﺩﻣﺣﻟﺍ
ﻝ
ﻱ
ﻱ
ﻲﺗﺑﻳﺑﺣ
ﻲﺗﺑﻳﺑﺣ
praise be to
praise for
thank
my dear
my love
0.4
0.3
Source and target
phrases both in table,
0.2 but other translation
preferred:
SCORE error
Target phrase
not in phrase table:
SENSE error
33
Word-level error statistics for Arabic-English benchmarks
60
Correct
Seen
Sense
Score
40
30
20
10
0 News 1 News 2 Weblogs Comments CTS Chat SMS
Relative frequency
50
Word-level error statistics for Chinese-English benchmarks
60
Correct
Seen
Sense
Score
10
0 News 1 News 2 Weblogs Comments CTS Chat SMS
50
Relative frequency
40
30
20
</figure>
<figureCaption confidence="0.999957">
Figure 5: Aggregate error statistics for Arabic-English (left) and Chinese-English (right) benchmark sets.
</figureCaption>
<bodyText confidence="0.999927076923077">
Quantitative results. The aggregate error statis-
tics for each data set are shown in Figure 5. To put
our results into perspective, we recall the findings
of Irvine et al. (2013). They find that for formal
domains using a French-English system, 50–60%
of the alignment links are correct, and SCORE er-
rors are more common than SENSE errors, which
in turn are more common than SEEN errors. While
we observe a similar distribution for our Arabic-
English news benchmarks, these numbers do not
generalize to the Arabic-English UG benchmarks
nor to any of the Chinese-English data sets.
First, the portion of SEEN errors increases dra-
matically for the Arabic-English UG translation
tasks. For Chinese-English this trend is less pro-
nounced yet also clearly observable. Next, SENSE
errors also increase substantially for most of the
UG data, making up the majority of the errors for
Chinese-English SMS and chat. This indicates
that a promising strategy for adapting SMT sys-
tems to translating UG data involves generating
new target-side translation candidates that match
the source phrases in the input sentences. Finally,
we evaluate the fraction of SCORE errors. While
this is the most commonly observed error type in
most of the data sets, there seems to be very lit-
tle correspondence with the genre or BLEU scores
of the benchmarks. This is an interesting finding
since most work in system adaptation for SMT fo-
cuses on better scoring of existing translation can-
didates (Matsoukas et al., 2009; Foster et al., 2010;
Axelrod et al., 2011; Chen et al., 2013, among oth-
ers). However, for UG translation tasks this does
not appear as the most profitable approach.
Qualitative results. The generated sentence-
level error annotations allow us to examine the var-
ious error types in detail. The first phenomenon
that we repeatedly observe in the UG data are
SEEN errors due to misspellings or, in the case of
Arabic, dialectal forms. Two such examples are
shown in Figures 6A and 6B: In the first, the SMT
system does not recognize the dialectal form of
verb negation ‘mtzEl$’, which is a morphologi-
cally complex word containing both a prefix and a
suffix. In the second, the input word ‘AlmwbAyl’
(‘mobile’) is wrongly spelled ‘AlmwyAyl’. It is
interesting to note that ‘b’ and ‘y’ are very similar
in the Arabic script. This type of error is partic-
ularly frequent in chat and SMS, which can partly
explain the different distribution of errors across
the Arabic-English data sets (Figure 5).
Also frequently observed in the UG data are
SMT lexical choices that are more formal than the
reference translations. This is not surprising given
the large amount of formal data in the SMT mod-
els, but it does illustrate the need for adaptation
to UG data. Often, the optimal lexical choice is
simply absent from the SMT models, resulting in
SENSE errors. This can be observed in Figure 6A,
where ‘sons’ is output instead of ‘kids’, and in Fig-
ure 6C, where ‘i understand’ is output instead of
the colloquial ‘i got it’. In other situations, the
annotated SCORE errors indicate that the correct
choice was available to the SMT system without
being selected for translation. For example in Fig-
ure 6D, the output ‘my parents’ is preferred to the
more colloquial ‘mom and dad’ in the reference.
Another phenomenon, particularly common for
Chinese-English UG translations, is that idioms
are translated in small chunks, thereby losing their
meaning as a phrase. In Figure 6D, the char-
acters ‘说’, ‘一’, and ‘声’ mean ‘to say’, ‘one’,
and ‘sound’, respectively. The phrase ‘说一声’ as
a whole means ‘talk a bit about something’ but
is not covered by the SMT models. Similarly,
‘你路上慢点’ in Figure 6E literally means ‘you on
the road slow a bit’, which, if covered by the mod-
els, could have been translated into ‘be careful on
</bodyText>
<page confidence="0.994335">
34
</page>
<figure confidence="0.996739576923077">
B) Arabic-English chat example
A) Arabic-English SMS example
C) Arabic-English CTS example
Ref: yeah ,i got it , yeah
Input: Ah
fhmt Ah
Output: , i understand ,
D) Chinese-English SMS example
Ref: can you tell mom and dad ?
Input:
爸妈 说 一 声
你 跟
Output: you talk to my parents said a voices
Output: she said because of the sons
Output: on the
Ref: she said so the kids do not feel upset
Ref: for the mobile phone
Input: qAlt E$An AlEyAl mtzEl$
Input: Ely
AlmwyAyl
Correct
Seen error
Sense error
Score error
E) Chinese-English SMS example
Output: on the internet , and you are on the road to slow points
</figure>
<figureCaption confidence="0.99911875">
Figure 6: Sentence-level error annotations from
various UG benchmarks illustrating common is-
sues in SMT of UG data. We use Buckwalter
transliteration to represent the Arabic source text.
</figureCaption>
<figure confidence="0.9963362">
Ref: i &apos;m online . take your time
Input:
±网
7 你
19± it A
</figure>
<bodyText confidence="0.99904452631579">
your way’ or ‘take your time’. These examples il-
lustrate that the low phrase pair recall for longer
phrases severely complicates SMT of UG data.
A final recurring issue in SMS and chat mes-
sages is the omission of first person pronouns, see
for example Figure 6E. The Chinese source phrase
‘上网了’ literally means ‘get online’ (+ auxiliary
word marking past tense). A native speaker un-
derstands that this concerns the sender, which is
reflected by a first person pronoun in the reference.
The SMT system, on the other hand, cannot infer
the subject of this phrase and instead generates a
translation without pronouns.
Other, less common, types of errors occurring in
the UG data are due to inconsistent segmentation
or tokenization of input text, which mostly affects
rare words, emoticons, and repeating punctuation.
Finally, SEEN errors for named entities are overall
rare but occur in both news and UG benchmarks.
</bodyText>
<sectionHeader confidence="0.994727" genericHeader="conclusions">
5 Conclusions and future directions
</sectionHeader>
<bodyText confidence="0.9999492">
Translating user-generated (UG) text is a diffi-
cult task for SMT. To explain the poor transla-
tion quality observed for UG data, we have per-
formed a detailed error analysis on two language
pairs (Arabic-English and Chinese-English) and
five different types of UG data (SMS, chat, CTS,
weblogs, and comments). Our quantitative re-
sults show, among other things, that (i) UG data is trans-
lated with shorter source phrases than news, (ii)
UG translation model coverage deteriorates sub-
stantially for longer phrases, and (iii) phrase-pair
OOVs pose a bigger challenge to UG translation
tasks than source OOVs. In our qualitative anal-
ysis we found that common issues in UG data in-
clude (i) OOVs due to misspellings or Arabic di-
alectal forms, (ii) lexical choices that do not reflect
colloquial formulations, (iii) phrasal idioms being
translated word by word, and (iv) omitted first per-
son pronouns in SMS and chat.
Finally, different types of UG exhibit dissimi-
lar error distributions, demanding diverse strate-
gies to improve SMT quality. For example, SMS
and chat data might benefit from text normaliza-
tion (Bertoldi et al., 2010; Yvon, 2010; Ling et
al., 2013a) or otherwise resolving source OOVs,
which also has been the main focus of previ-
ous work on SMT for UG. On the other hand,
while research in domain adaptation for SMT of-
ten aims at better scoring of existing translation
candidates, we have shown that for many UG tasks
the most promising direction involves increasing
phrase pair recall of the SMT models (i.e., re-
ducing phrase pair OOVs), for example by para-
phrasing (Callison-Burch et al., 2006) or transla-
tion synthesis (Irvine and Callison-Burch, 2014).
</bodyText>
<sectionHeader confidence="0.998296" genericHeader="acknowledgments">
Acknowledgments
</sectionHeader>
<bodyText confidence="0.999338">
This research was funded in part by the Nether-
lands Organization for Scientific Research (NWO)
under project number 639.022.213. We thank
Rachel Cotterill, Nigel Dewdney, and the anony-
mous reviewers for their valuable comments.
</bodyText>
<page confidence="0.998915">
35
</page>
<sectionHeader confidence="0.990212" genericHeader="references">
References
</sectionHeader>
<reference confidence="0.999858398148148">
Amittai Axelrod, Xiaodong He, and Jianfeng Gao.
2011. Domain adaptation via pseudo in-domain data
selection. In Proceedings of the 2011 Conference on
Empirical Methods in Natural Language Process-
ing, pages 355–362.
Satanjeev Banerjee and Alon Lavie. 2005. METEOR:
an automatic metric for MT evaluation with im-
proved correlation with human judgments. In Pro-
ceedings of the ACL Workshop on Intrinsic and Ex-
trinsic Evaluation Measures for Machine Transla-
tion, pages 65–72.
Pratyush Banerjee, Sudip Kumar Naskar, Johann Ro-
turier, Andy Way, and Josef van Genabith. 2011.
Domain adaptation in statistical machine translation
of user-forum data using component level mixture
modelling. In Proceedings of the XIII Machine
Translation Summit, pages 285–292.
Pratyush Banerjee, Sudip Kumar Naskar, Johann Ro-
turier, Andy Way, and Josef van Genabith. 2012.
Domain adaptation in SMT of user-generated forum
content guided by OOV word reduction: Normal-
ization and/or supplementary data. In Proceedings
of the 16th Conference of the European Association
for Machine Translation, pages 169–176.
Nicola Bertoldi, Mauro Cettolo, and Marcello Fed-
erico. 2010. Statistical machine translation of texts
with misspelled words. In Human Language Tech-
nologies: The 2010 Annual Conference of the North
American Chapter of the Association for Computa-
tional Linguistics, pages 412–419.
Arianna Bisazza, Nick Ruiz, and Marcello Federico.
2011. Fill-up versus interpolation methods for
phrase-based SMT adaptation. In Proceedings of
the 8th International Workshop on Spoken Language
Translation, pages 136–143.
Chris Callison-Burch, Philipp Koehn, and Miles Os-
borne. 2006. Improved statistical machine trans-
lation using paraphrases. In Proceedings of the
Human Language Technology Conference of the
NAACL, Main Conference, pages 17–24.
Jordi Carrera, Olga Beregovaya, and Alex Yanishevsky.
2009. Machine translation for cross-language social
media.
Boxing Chen, Roland Kuhn, and George Foster. 2013.
Vector space model for adaptation in statistical ma-
chine translation. In Proceedings of the 51st Annual
Meeting of the Association for Computational Lin-
guistics, pages 1285–1293.
Kevin Duh, Katsuhito Sudoh, and Hajime Tsukada.
2010. Analysis of translation model adaptation in
statistical machine translation. In Proceedings of
the 7th International Workshop on Spoken Language
Translation (IWSLT 2010), pages 243–250.
George Foster, Cyril Goutte, and Roland Kuhn. 2010.
Discriminative instance weighting for domain adap-
tation in statistical machine translation. In Proceed-
ings of the 2010 Conference on Empirical Methods
in Natural Language Processing, pages 451–459.
Nizar Habash and Owen Rambow. 2005. Arabic tok-
enization, part-of-speech tagging and morphological
disambiguation in one fell swoop. In Proceedings of
the 43rd Annual Meeting on Association for Compu-
tational Linguistics, pages 573–580.
Barry Haddow and Philipp Koehn. 2012. Analysing
the effect of out-of-domain data on SMT systems. In
Proceedings of the Seventh Workshop on Statistical
Machine Translation, pages 422–432.
Mark Hopkins and Jonathan May. 2011. Tuning as
ranking. In Proceedings of the 2011 Conference
on Empirical Methods in Natural Language Pro-
cessing, pages 1352–1362. Association for Compu-
tational Linguistics.
Ann Irvine and Chris Callison-Burch. 2014. Halluci-
nating phrase translations for low resource MT. In
Proceedings of the Eighteenth Conference on Com-
putational Natural Language Learning, pages 160–
170.
Ann Irvine, John Morgan, Marine Carpuat, Hal
Daumé III, and Dragos Stefan Munteanu. 2013.
Measuring machine translation errors in new do-
mains. Transactions of the Association for Compu-
tational Linguistics, 1:429–440.
Laura Jehl, Felix Hieber, and Stefan Riezler. 2012.
Twitter translation using translation-based cross-
lingual retrieval. In Proceedings of the Seventh
Workshop on Statistical Machine Translation, pages
410–421.
Philipp Koehn, Franz Josef Och, and Daniel Marcu.
2003. Statistical phrase-based translation. In Pro-
ceedings of the 2003 Conference of the North Amer-
ican Chapter of the Association for Computational
Linguistics on Human Language Technology, pages
48–54.
Philipp Koehn, Hieu Hoang, Alexandra Birch, Chris
Callison-Burch, Marcello Federico, Nicola Bertoldi,
Brooke Cowan, Wade Shen, Christine Moran,
Richard Zens, Chris Dyer, Ondrej Bojar, Alexan-
dra Constantin, and Evan Herbst. 2007. Moses:
Open source toolkit for statistical machine transla-
tion. In Proceedings of the 45th Annual Meeting of
the Association for Computational Linguistics Com-
panion Volume Proceedings of the Demo and Poster
Sessions, pages 177–180.
Wang Ling, Chris Dyer, Alan W Black, and Isabel
Trancoso. 2013a. Paraphrasing 4 microblog nor-
malization. In Proceedings of the 2013 Conference
on Empirical Methods in Natural Language Pro-
cessing, pages 73–84.
</reference>
<page confidence="0.975314">
36
</page>
<reference confidence="0.999157695652174">
Wang Ling, Guang Xiang, Chris Dyer, Alan Black, and
Isabel Trancoso. 2013b. Microblogs as parallel cor-
pora. In Proceedings of the 51st Annual Meeting of
the Association for Computational Linguistics (Vol-
ume 1: Long Papers), pages 176–186.
Spyros Matsoukas, Antti-Veikko I. Rosti, and Bing
Zhang. 2009. Discriminative corpus weight estima-
tion for machine translation. In Proceedings of the
2009 Conference on Empirical Methods in Natural
Language Processing, pages 708–717.
Franz Josef Och and Hermann Ney. 2003. A sys-
tematic comparison of various statistical alignment
models. Computational Linguistics, 29(1):19–51.
Kishore Papineni, Salim Roukos, Todd Ward, and Wei-
Jing Zhu. 2002. BLEU: a method for automatic
evaluation of machine translation. In Proceedings
of the 40th Annual Meeting of the Association for
Computational Linguistics, pages 311–318.
Maja Popović and Hermann Ney. 2011. Towards au-
tomatic error analysis of machine translation output.
Computational Linguistics, 37(4):657–688.
Johann Roturier and Anthony Bensadoun. 2011. Eval-
uation of MT systems to translate user generated
content. In Proceedings of the XIII Machine Trans-
lation Summit, pages 244–251.
Matthew Snover, Bonnie Dorr, Richard Schwartz, Lin-
nea Micciulla, and John Makhoul. 2006. A study of
translation edit rate with targeted human annotation.
In Proceedings of the Seventh Conference of the As-
sociation for Machine Translation in the Americas,
pages 223–231.
Huihsin Tseng, Pichuan Chang, Galen Andrew, Daniel
Jurafsky, and Christopher Manning. 2005. A con-
ditional random field word segmenter. In Proceed-
ings of the fourth SIGHAN workshop on Chinese
language Processing, volume 171, pages 168–171.
Marlies van der Wees, Arianna Bisazza, Wouter
Weerkamp, and Christof Monz. 2015. What’s in
a domain? Analyzing genre and topic differences in
statistical machine translation. In Proceedings of the
Joint Conference of the 53rd Annual Meeting of the
ACL and the 7th International Joint Conference on
Natural Language Processing of the AFNLP.
François Yvon. 2010. Rewriting the orthography of
SMS messages. Natural Language Engineering,
16(2):133–159.
</reference>
<page confidence="0.999606">
37
</page>
</variant>
</algorithm>
<algorithm name="ParsHed" version="110505">
<variant no="0" confidence="0.904204">
<title confidence="0.997253">Five Shades of Noise: Analyzing Machine Translation Errors in User-Generated Text</title>
<author confidence="0.966602">Marlies van_der_Wees Arianna Bisazza Christof Monz</author>
<affiliation confidence="0.96968">Informatics Institute, University of</affiliation>
<abstract confidence="0.997678631578947">It is widely accepted that translating usergenerated (UG) text is a difficult task for modern statistical machine translation (SMT) systems. The translation quality metrics typically used in the SMT literature reflect the overall quality of the system output but provide little insight into what exactly makes UG text translation difficult. This paper analyzes in detail the behavior of a state-of-the-art SMT system on five different types of informal text. The results help to demystify the poor SMT performance experienced by researchers who use SMT as an intermediate step of their UG-NLP pipeline, and to identify translation modeling aspects that the SMT community should more urgently address to improve translation of UG data.</abstract>
</variant>
</algorithm>
<algorithm name="ParsCit" version="110505">
<citationList>
<citation valid="true">
<authors>
<author>Amittai Axelrod</author>
<author>Xiaodong He</author>
<author>Jianfeng Gao</author>
</authors>
<title>Domain adaptation via pseudo in-domain data selection.</title>
<date>2011</date>
<booktitle>In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>355--362</pages>
<contexts>
<context position="25918" citStr="Axelrod et al., 2011" startWordPosition="4320" endWordPosition="4323">icates that a promising strategy for adapting SMT systems to translating UG data involves generating new target-side translation candidates that match the source phrases in the input sentences. Finally, we evaluate the fraction of SCORE errors. While this is the most commonly observed error type in most of the data sets, there seems to be very little correspondance with the genre or BLEU scores of the benchmarks. This is an interesting finding since most work in system adaptation for SMT focuses on better scoring of existing translation candidates (Matsoukas et al., 2009; Foster et al., 2010; Axelrod et al., 2011; Chen et al., 2013, among others). However, for UG translation tasks this does not appear as the most profitable approach. Qualitative results. The generated sentencelevel error annotations allow us to examine the various error types in detail. The first phenomenon that we repeatedly observe in the UG data are SEEN errors due to misspellings or, in the case of Arabic, dialectal forms. Two such examples are shown in Figures 6A and 6B: In the first, the SMT system does not recognize the dialectal form of verb negation ‘mtzEl$’, which is a morphologically complex word containing both a prefix an</context>
</contexts>
<marker>Axelrod, He, Gao, 2011</marker>
<rawString>Amittai Axelrod, Xiaodong He, and Jianfeng Gao. 2011. Domain adaptation via pseudo in-domain data selection. In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing, pages 355–362.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Satanjeev Banerjee</author>
<author>Alon Lavie</author>
</authors>
<title>METEOR: an automatic metric for MT evaluation with improved correlation with human judgments.</title>
<date>2005</date>
<booktitle>In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation,</booktitle>
<pages>65--72</pages>
<contexts>
<context position="1360" citStr="Banerjee and Lavie, 2005" startWordPosition="203" endWordPosition="206">formance experienced by researchers who use SMT as an intermediate step of their UG-NLP pipeline, and to identify translation modeling aspects that the SMT community should more urgently address to improve translation of UG data. 1 Introduction User-generated (UG) text such as found on social media and web forums poses different challenges to statistical machine translation (SMT) than formal text. This is reflected by poor translation quality for informal genres (see for example Figure 1), which is typically measured with automatic quality metrics such as BLEU (Papineni et al., 2002), METEOR (Banerjee and Lavie, 2005), or TER (Snover et al., 2006). These scores alone, however, only reflect the overall translation quality, and do not provide any insight in what exactly makes translating UG text hard. While such knowledge is crucial for improving SMT of UG text, surprisingly little work on error analysis for SMT of usergenerated text has been reported. Moreover, the notion of user-generated content ﺯﺗﻣﻝﺎﻳﻌﻟﺍﻥﺎﺷﻋﺕﻟﺎﻗ she said so the kids do not feel upset she said because of the sons In (Chinese): 你 路上 慢 点 Reference: take your time MT output: you are on the road to slow points Figure 1: SMS examples with poor</context>
</contexts>
<marker>Banerjee, Lavie, 2005</marker>
<rawString>Satanjeev Banerjee and Alon Lavie. 2005. METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation, pages 65–72.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Pratyush Banerjee</author>
<author>Sudip Kumar Naskar</author>
<author>Johann Roturier</author>
<author>Andy Way</author>
<author>Josef van Genabith</author>
</authors>
<title>Domain adaptation in statistical machine translation of user-forum data using component level mixture modelling.</title>
<date>2011</date>
<booktitle>In Proceedings of the XIII Machine Translation Summit,</booktitle>
<pages>285--292</pages>
<marker>Banerjee, Naskar, Roturier, Way, van Genabith, 2011</marker>
<rawString>Pratyush Banerjee, Sudip Kumar Naskar, Johann Roturier, Andy Way, and Josef van Genabith. 2011. Domain adaptation in statistical machine translation of user-forum data using component level mixture modelling. In Proceedings of the XIII Machine Translation Summit, pages 285–292.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Pratyush Banerjee</author>
<author>Sudip Kumar Naskar</author>
<author>Johann Roturier</author>
<author>Andy Way</author>
<author>Josef van Genabith</author>
</authors>
<title>Domain adaptation in SMT of user-generated forum content guided by OOV word reduction: Normalization and/or supplementary data.</title>
<date>2012</date>
<booktitle>In Proceedings of the 16th Conference of the European Association for Machine Translation,</booktitle>
<pages>169--176</pages>
<marker>Banerjee, Naskar, Roturier, Way, van Genabith, 2012</marker>
<rawString>Pratyush Banerjee, Sudip Kumar Naskar, Johann Roturier, Andy Way, and Josef van Genabith. 2012. Domain adaptation in SMT of user-generated forum content guided by OOV word reduction: Normalization and/or supplementary data. In Proceedings of the 16th Conference of the European Association for Machine Translation, pages 169–176.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Nicola Bertoldi</author>
<author>Mauro Cettolo</author>
<author>Marcello Federico</author>
</authors>
<title>Statistical machine translation of texts with misspelled words.</title>
<date>2010</date>
<booktitle>In Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics,</booktitle>
<pages>412--419</pages>
<contexts>
<context position="6200" citStr="Bertoldi et al., 2010" startWordPosition="1010" endWordPosition="1013">bic side. on SMT error analysis studies the effect of domain adaptation on SMT, for example by examining in which stage of the SMT pipeline the available indomain data can best be used (Duh et al., 2010), or whether it is more promising to improve either phrase extraction or scoring (Bisazza et al., 2011; Haddow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based MT (Carrera et al., 2009), or models trained on formal and informal data (Banerjee et al., 2011). Finally, Roturier and Bensadoun (2011) conduct a comparative study to determine the ability of several SMT systems to translate UG text, but they do not examine what errors the systems make. To our knowledge, our work </context>
<context position="31089" citStr="Bertoldi et al., 2010" startWordPosition="5204" endWordPosition="5207">hrases, and (iii) phrase-pair OOVs pose a bigger challenge to UG translation tasks than source OOVs. In our qualitative analysis we found that common issues in UG data include (i) OOVs due to misspellings or Arabic dialectal forms, (ii) lexical choices that do not reflect colloquial formulations, (iii) phrasal idioms being translated word by word, and (iv) omitted first person pronouns in SMS and chat. Finally, different types of UG exhibit dissimilar error distributions, demanding diverse strategies to improve SMT quality. For example, SMS and chat data might benefit from text normalization (Bertoldi et al., 2010; Yvon, 2010; Ling et al., 2013a) or otherwise resolving source OOVs, which also has been the main focus of previous work on SMT for UG. On the other hand, while research in domain adaptation for SMT often aims at better scoring of existing translation candidates, we have shown that for many UG tasks the most promising direction involves increasing phrase pair recall of the SMT models (i.e., reducing phrase pair OOVs), for example by paraphrasing (Callison-Burch et al., 2006) or translation synthesis (Irvine and Callison-Burch, 2014). Acknowledgments This research was funded in part by the Net</context>
</contexts>
<marker>Bertoldi, Cettolo, Federico, 2010</marker>
<rawString>Nicola Bertoldi, Mauro Cettolo, and Marcello Federico. 2010. Statistical machine translation of texts with misspelled words. In Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics, pages 412–419.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Arianna Bisazza</author>
<author>Nick Ruiz</author>
<author>Marcello Federico</author>
</authors>
<title>Fill-up versus interpolation methods for phrase-based SMT adaptation.</title>
<date>2011</date>
<booktitle>In Proceedings of the 8th International Workshop on Spoken Language Translation,</booktitle>
<pages>136--143</pages>
<contexts>
<context position="5883" citStr="Bisazza et al., 2011" startWordPosition="956" endWordPosition="959">K 1 Chat 3.5K 22.5K 7.1K 44.5K 1 CTS 2.4K 23.1K 3.6K 40.6K 1 Comments 1.1K 25.8K 1.7K 45.5K 1 Weblogs 0.8K 14.6K 1.3K 39.9K 4 News 1 1.0K 26.9K 1.6K 46.3K 1 News 2 1.0K 34.4K 1.4K 46.6K 4 Table 1: Statistics of the Arabic-English UG (top) and contrastive news (bottom) evaluation sets. Tokens are counted on the Arabic side. on SMT error analysis studies the effect of domain adaptation on SMT, for example by examining in which stage of the SMT pipeline the available indomain data can best be used (Duh et al., 2010), or whether it is more promising to improve either phrase extraction or scoring (Bisazza et al., 2011; Haddow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based</context>
</contexts>
<marker>Bisazza, Ruiz, Federico, 2011</marker>
<rawString>Arianna Bisazza, Nick Ruiz, and Marcello Federico. 2011. Fill-up versus interpolation methods for phrase-based SMT adaptation. In Proceedings of the 8th International Workshop on Spoken Language Translation, pages 136–143.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Chris Callison-Burch</author>
<author>Philipp Koehn</author>
<author>Miles Osborne</author>
</authors>
<title>Improved statistical machine translation using paraphrases.</title>
<date>2006</date>
<booktitle>In Proceedings of the Human Language Technology Conference of the NAACL, Main Conference,</booktitle>
<pages>17--24</pages>
<marker>Callison-Burch, Koehn, Osborne, 2006</marker>
<rawString>Chris Callison-Burch, Philipp Koehn, and Miles Osborne. 2006. Improved statistical machine translation using paraphrases. In Proceedings of the Human Language Technology Conference of the NAACL, Main Conference, pages 17–24.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Jordi Carrera</author>
<author>Olga Beregovaya</author>
<author>Alex Yanishevsky</author>
</authors>
<title>Machine translation for cross-language social media.</title>
<date>2009</date>
<contexts>
<context position="6509" citStr="Carrera et al., 2009" startWordPosition="1062" endWordPosition="1065">dow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based MT (Carrera et al., 2009), or models trained on formal and informal data (Banerjee et al., 2011). Finally, Roturier and Bensadoun (2011) conduct a comparative study to determine the ability of several SMT systems to translate UG text, but they do not examine what errors the systems make. To our knowledge, our work is the first that looks inside an SMT system to systematically inspect its behavior across a diverse spectrum of UG text types. 3 Experimental setup We perform our error analysis on two language pairs, Arabic-English and Chinese-English. 3.1 Evaluation sets For both language pairs we use evaluation sets for </context>
</contexts>
<marker>Carrera, Beregovaya, Yanishevsky, 2009</marker>
<rawString>Jordi Carrera, Olga Beregovaya, and Alex Yanishevsky. 2009. Machine translation for cross-language social media.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Boxing Chen</author>
<author>Roland Kuhn</author>
<author>George Foster</author>
</authors>
<title>Vector space model for adaptation in statistical machine translation.</title>
<date>2013</date>
<booktitle>In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics,</booktitle>
<pages>1285--1293</pages>
<contexts>
<context position="25937" citStr="Chen et al., 2013" startWordPosition="4324" endWordPosition="4327">g strategy for adapting SMT systems to translating UG data involves generating new target-side translation candidates that match the source phrases in the input sentences. Finally, we evaluate the fraction of SCORE errors. While this is the most commonly observed error type in most of the data sets, there seems to be very little correspondance with the genre or BLEU scores of the benchmarks. This is an interesting finding since most work in system adaptation for SMT focuses on better scoring of existing translation candidates (Matsoukas et al., 2009; Foster et al., 2010; Axelrod et al., 2011; Chen et al., 2013, among others). However, for UG translation tasks this does not appear as the most profitable approach. Qualitative results. The generated sentencelevel error annotations allow us to examine the various error types in detail. The first phenomenon that we repeatedly observe in the UG data are SEEN errors due to misspellings or, in the case of Arabic, dialectal forms. Two such examples are shown in Figures 6A and 6B: In the first, the SMT system does not recognize the dialectal form of verb negation ‘mtzEl$’, which is a morphologically complex word containing both a prefix and a suffix. In the </context>
</contexts>
<marker>Chen, Kuhn, Foster, 2013</marker>
<rawString>Boxing Chen, Roland Kuhn, and George Foster. 2013. Vector space model for adaptation in statistical machine translation. In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics, pages 1285–1293.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Kevin Duh</author>
<author>Katsuhito Sudoh</author>
<author>Hajime Tsukada</author>
</authors>
<title>Analysis of translation model adaptation in statistical machine translation.</title>
<date>2010</date>
<booktitle>In Proceedings of the 7th International Workshop on Spoken Language Translation (IWSLT</booktitle>
<pages>243--250</pages>
<contexts>
<context position="5781" citStr="Duh et al., 2010" startWordPosition="939" endWordPosition="942"> domains. Other work Genre Dev set Test set Refs Lines Tokens Lines Tokens SMS 2.7K 23.3K 7.6K 44.9K 1 Chat 3.5K 22.5K 7.1K 44.5K 1 CTS 2.4K 23.1K 3.6K 40.6K 1 Comments 1.1K 25.8K 1.7K 45.5K 1 Weblogs 0.8K 14.6K 1.3K 39.9K 4 News 1 1.0K 26.9K 1.6K 46.3K 1 News 2 1.0K 34.4K 1.4K 46.6K 4 Table 1: Statistics of the Arabic-English UG (top) and contrastive news (bottom) evaluation sets. Tokens are counted on the Arabic side. on SMT error analysis studies the effect of domain adaptation on SMT, for example by examining in which stage of the SMT pipeline the available indomain data can best be used (Duh et al., 2010), or whether it is more promising to improve either phrase extraction or scoring (Bisazza et al., 2011; Haddow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al.,</context>
</contexts>
<marker>Duh, Sudoh, Tsukada, 2010</marker>
<rawString>Kevin Duh, Katsuhito Sudoh, and Hajime Tsukada. 2010. Analysis of translation model adaptation in statistical machine translation. In Proceedings of the 7th International Workshop on Spoken Language Translation (IWSLT 2010), pages 243–250.</rawString>
</citation>
<citation valid="true">
<authors>
<author>George Foster</author>
<author>Cyril Goutte</author>
<author>Roland Kuhn</author>
</authors>
<title>Discriminative instance weighting for domain adaptation in statistical machine translation.</title>
<date>2010</date>
<booktitle>In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>451--459</pages>
<contexts>
<context position="25896" citStr="Foster et al., 2010" startWordPosition="4316" endWordPosition="4319">MS and chat. This indicates that a promising strategy for adapting SMT systems to translating UG data involves generating new target-side translation candidates that match the source phrases in the input sentences. Finally, we evaluate the fraction of SCORE errors. While this is the most commonly observed error type in most of the data sets, there seems to be very little correspondance with the genre or BLEU scores of the benchmarks. This is an interesting finding since most work in system adaptation for SMT focuses on better scoring of existing translation candidates (Matsoukas et al., 2009; Foster et al., 2010; Axelrod et al., 2011; Chen et al., 2013, among others). However, for UG translation tasks this does not appear as the most profitable approach. Qualitative results. The generated sentencelevel error annotations allow us to examine the various error types in detail. The first phenomenon that we repeatedly observe in the UG data are SEEN errors due to misspellings or, in the case of Arabic, dialectal forms. Two such examples are shown in Figures 6A and 6B: In the first, the SMT system does not recognize the dialectal form of verb negation ‘mtzEl$’, which is a morphologically complex word conta</context>
</contexts>
<marker>Foster, Goutte, Kuhn, 2010</marker>
<rawString>George Foster, Cyril Goutte, and Roland Kuhn. 2010. Discriminative instance weighting for domain adaptation in statistical machine translation. In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing, pages 451–459.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Nizar Habash</author>
<author>Owen Rambow</author>
</authors>
<title>Arabic tokenization, part-of-speech tagging and morphological disambiguation in one fell swoop.</title>
<date>2005</date>
<booktitle>In Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics,</booktitle>
<pages>573--580</pages>
<contexts>
<context position="9557" citStr="Habash and Rambow, 2005" startWordPosition="1557" endWordPosition="1560">ed benchmark sets as much as possible, however sometimes limited by availability. Tables 1 and 2 show the data specifications of the Arabic-English and ChineseEnglish evaluation sets, respectively.2 3.2 SMT systems All experiments presented in this paper are performed with our in-house state-of-the-art system based on phrase-based SMT and similar to Moses (Koehn et al., 2007). Our Arabic-English system is built from 1.75M lines (52.9M source tokens) of parallel text, and our Chinese-English system from 3.13M lines (55.4M source tokens) of parallel text. We tokenize all Arabic data using MADA (Habash and Rambow, 2005), ATB scheme, and we segment the Chinese data following Tseng et al. (2005). Both systems use an adapted 5-gram English language model that linearly interpolates different English Gigaword subcorpora with the 2Note that two evaluation sets contain four reference translations instead of one. To allow for fair comparison, we average the scores of the four references in all our analyses. English side of our bitexts, containing both news and UG data. While parallel data is scarce in general, the situation is much worse for UG data, where there are hardly any sizable parallel corpora for any langua</context>
</contexts>
<marker>Habash, Rambow, 2005</marker>
<rawString>Nizar Habash and Owen Rambow. 2005. Arabic tokenization, part-of-speech tagging and morphological disambiguation in one fell swoop. In Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics, pages 573–580.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Barry Haddow</author>
<author>Philipp Koehn</author>
</authors>
<title>Analysing the effect of out-of-domain data on SMT systems.</title>
<date>2012</date>
<booktitle>In Proceedings of the Seventh Workshop on Statistical Machine Translation,</booktitle>
<pages>422--432</pages>
<contexts>
<context position="5908" citStr="Haddow and Koehn, 2012" startWordPosition="960" endWordPosition="963">1K 44.5K 1 CTS 2.4K 23.1K 3.6K 40.6K 1 Comments 1.1K 25.8K 1.7K 45.5K 1 Weblogs 0.8K 14.6K 1.3K 39.9K 4 News 1 1.0K 26.9K 1.6K 46.3K 1 News 2 1.0K 34.4K 1.4K 46.6K 4 Table 1: Statistics of the Arabic-English UG (top) and contrastive news (bottom) evaluation sets. Tokens are counted on the Arabic side. on SMT error analysis studies the effect of domain adaptation on SMT, for example by examining in which stage of the SMT pipeline the available indomain data can best be used (Duh et al., 2010), or whether it is more promising to improve either phrase extraction or scoring (Bisazza et al., 2011; Haddow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based MT (Carrera et al., 2009</context>
</contexts>
<marker>Haddow, Koehn, 2012</marker>
<rawString>Barry Haddow and Philipp Koehn. 2012. Analysing the effect of out-of-domain data on SMT systems. In Proceedings of the Seventh Workshop on Statistical Machine Translation, pages 422–432.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Mark Hopkins</author>
<author>Jonathan May</author>
</authors>
<title>Tuning as ranking.</title>
<date>2011</date>
<booktitle>In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>1352--1362</pages>
<publisher>Association for Computational Linguistics</publisher>
<contexts>
<context position="10608" citStr="Hopkins and May, 2011" startWordPosition="1726" endWordPosition="1729">oth news and UG data. While parallel data is scarce in general, the situation is much worse for UG data, where there are hardly any sizable parallel corpora for any language pair. As a consequence, the training data of both systems comprises 70-75% news data, mostly LDC-distributed, and 25-30% data in various other genres (weblogs, comments, editorials, speech transcripts, and small amounts of chat data), mostly harvested from the web. Per language pair, all experiments use the same SMT models, but we tune parameters separately for each benchmark set using pairwise ranking optimization (PRO) (Hopkins and May, 2011). To put the results of our system into perspective, we also run a first series of experiments on a wellknown and established online SMT system. 4 Error analysis and results We perform four series of experiments, each with the goal of answering different questions about SMT for UG text: 1. How large is the gap in translation quality between news and different types of UG data? (§4.1). To answer this question, we measure the BLEU score of two state-of-the-art SMT system outputs on all our data sets. 2. What kind of translation choices does the SMT system make for UG data? To answer this questio</context>
</contexts>
<marker>Hopkins, May, 2011</marker>
<rawString>Mark Hopkins and Jonathan May. 2011. Tuning as ranking. In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing, pages 1352–1362. Association for Computational Linguistics.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Ann Irvine</author>
<author>Chris Callison-Burch</author>
</authors>
<title>Hallucinating phrase translations for low resource MT.</title>
<date>2014</date>
<booktitle>In Proceedings of the Eighteenth Conference on Computational Natural Language Learning,</booktitle>
<pages>160--170</pages>
<marker>Irvine, Callison-Burch, 2014</marker>
<rawString>Ann Irvine and Chris Callison-Burch. 2014. Hallucinating phrase translations for low resource MT. In Proceedings of the Eighteenth Conference on Computational Natural Language Learning, pages 160– 170.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Ann Irvine</author>
<author>John Morgan</author>
<author>Marine Carpuat</author>
<author>Hal Daumé III</author>
<author>Dragos Stefan Munteanu</author>
</authors>
<title>Measuring machine translation errors in new domains.</title>
<date>2013</date>
<journal>Transactions of the Association for Computational Linguistics,</journal>
<volume>1</volume>
<pages>429--440</pages>
<marker>Irvine, Morgan, Carpuat, Daumé, Munteanu, 2013</marker>
<rawString>Ann Irvine, John Morgan, Marine Carpuat, Hal Daumé III, and Dragos Stefan Munteanu. 2013. Measuring machine translation errors in new domains. Transactions of the Association for Computational Linguistics, 1:429–440.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Laura Jehl</author>
<author>Felix Hieber</author>
<author>Stefan Riezler</author>
</authors>
<title>Twitter translation using translation-based crosslingual retrieval.</title>
<date>2012</date>
<booktitle>In Proceedings of the Seventh Workshop on Statistical Machine Translation,</booktitle>
<pages>410--421</pages>
<contexts>
<context position="6386" citStr="Jehl et al., 2012" startWordPosition="1041" endWordPosition="1044">t al., 2010), or whether it is more promising to improve either phrase extraction or scoring (Bisazza et al., 2011; Haddow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based MT (Carrera et al., 2009), or models trained on formal and informal data (Banerjee et al., 2011). Finally, Roturier and Bensadoun (2011) conduct a comparative study to determine the ability of several SMT systems to translate UG text, but they do not examine what errors the systems make. To our knowledge, our work is the first that looks inside an SMT system to systematically inspect its behavior across a diverse spectrum of UG text types. 3 Experimental setup We perform our error analysis on two </context>
</contexts>
<marker>Jehl, Hieber, Riezler, 2012</marker>
<rawString>Laura Jehl, Felix Hieber, and Stefan Riezler. 2012. Twitter translation using translation-based crosslingual retrieval. In Proceedings of the Seventh Workshop on Statistical Machine Translation, pages 410–421.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Philipp Koehn</author>
<author>Franz Josef Och</author>
<author>Daniel Marcu</author>
</authors>
<title>Statistical phrase-based translation.</title>
<date>2003</date>
<booktitle>In Proceedings of the 2003 Conference of the North American Chapter of the Association for Computational Linguistics on Human Language Technology,</booktitle>
<pages>48--54</pages>
<contexts>
<context position="13900" citStr="Koehn et al., 2003" startWordPosition="2272" endWordPosition="2275">in BLEU are less pronounced, both across the different data sets and between the two SMT systems. Still, translation quality is worse for the UG data sets than for news, indicating that also for this language pair translating UG text is more challenging than translating news. As all subsequent analyses require systeminternal information, we carry out the experiments with our in-house system only. 4.2 Translation phrase length analysis Most state-of-the-art SMT systems, including our in-house system, are phrase-based, with translations being generated phrase by phrase rather than word by word (Koehn et al., 2003). An abundant use of small phrases during decoding indicates that the system is not taking advantage of the model’s ability to memorize large contextual and possibly non-compositional translation blocks. It is therefore interesting to measure the average phrase length (i.e., number of tokens) used by the system, for the source as well as the target language (Figure 3). For Arabic-English we see that source-side phrases are noticeably longer for both news benchmarks than for the UG data sets. The average target-side phrase length, on the other hand, shows less correlation with the genres of the</context>
</contexts>
<marker>Koehn, Och, Marcu, 2003</marker>
<rawString>Philipp Koehn, Franz Josef Och, and Daniel Marcu. 2003. Statistical phrase-based translation. In Proceedings of the 2003 Conference of the North American Chapter of the Association for Computational Linguistics on Human Language Technology, pages 48–54.</rawString>
</citation>
<citation valid="false">
<authors>
<author>Philipp Koehn</author>
<author>Hieu Hoang</author>
<author>Alexandra Birch</author>
<author>Chris Callison-Burch</author>
<author>Marcello Federico</author>
<author>Nicola Bertoldi</author>
<author>Brooke Cowan</author>
<author>Wade Shen</author>
<author>Christine Moran</author>
<author>Richard Zens</author>
<author>Chris Dyer</author>
<author>Ondrej Bojar</author>
<author>Alexandra Constantin</author>
<author>Evan Herbst</author>
</authors>
<title>Moses: Open source toolkit for statistical machine translation.</title>
<date>2007</date>
<booktitle>In Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions,</booktitle>
<pages>177--180</pages>
<contexts>
<context position="9311" citStr="Koehn et al., 2007" startWordPosition="1518" endWordPosition="1521">e distributions over various topics. Consequently, any observed differences between the news and UG portions of this data set can be entirely attributed to genre differences and not to potential topical variation. We have created similar-sized benchmark sets as much as possible, however sometimes limited by availability. Tables 1 and 2 show the data specifications of the Arabic-English and ChineseEnglish evaluation sets, respectively.2 3.2 SMT systems All experiments presented in this paper are performed with our in-house state-of-the-art system based on phrase-based SMT and similar to Moses (Koehn et al., 2007). Our Arabic-English system is built from 1.75M lines (52.9M source tokens) of parallel text, and our Chinese-English system from 3.13M lines (55.4M source tokens) of parallel text. We tokenize all Arabic data using MADA (Habash and Rambow, 2005), ATB scheme, and we segment the Chinese data following Tseng et al. (2005). Both systems use an adapted 5-gram English language model that linearly interpolates different English Gigaword subcorpora with the 2Note that two evaluation sets contain four reference translations instead of one. To allow for fair comparison, we average the scores of the fou</context>
<context position="15389" citStr="Koehn et al., 2007" startWordPosition="2519" endWordPosition="2522">, which is the case for most of our UG benchmarks, this can be due to (i) unreliable translation probabilities or (ii) to the mere lack of correct translation options in the models. We investigate both issues in the following analyses. 4.3 Model coverage analysis Next, we examine the translation model coverage for each data set, which tells us what phrases the system could have used for decoding. For each of our test sets, we create automatic word alignments using GIZA++ (Och and Ney, 2003), and extract from these the set of all reference phrase pairs using Moses’ phrase extraction algorithm (Koehn et al., 2007). By comparing this set of phrase pairs to the available phrases in the SMT models, which 31 Table 3: Target language model perplexity and translation model coverage of Arabic-English benchmarks. Phrase pair recall values are broken down by source phrase length. Intensities of the cell colors indicate relative recall values with respect to the best scoring benchmark (measured in BLEU). Source phrase recall Genre BLEU LM PP 1 2 3 4 News 1 33.8 65 99.7 88.9 56.3 26.1 News 2 21.5 86 99.6 88.1 53.7 21.8 Weblogs 22.3 152 99.2 80.5 40.6 13.5 Comments 17.2 117 97.7 80.2 43.0 15.3 CTS 16.0 103 97.4 66</context>
</contexts>
<marker>Koehn, Hoang, Birch, Callison-Burch, Federico, Bertoldi, Cowan, Shen, Moran, Zens, Dyer, Bojar, Constantin, Herbst, 2007</marker>
<rawString>Philipp Koehn, Hieu Hoang, Alexandra Birch, Chris Callison-Burch, Marcello Federico, Nicola Bertoldi, Brooke Cowan, Wade Shen, Christine Moran, Richard Zens, Chris Dyer, Ondrej Bojar, Alexandra Constantin, and Evan Herbst. 2007. Moses: Open source toolkit for statistical machine translation. In Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions, pages 177–180.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Wang Ling</author>
<author>Chris Dyer</author>
<author>Alan W Black</author>
<author>Isabel Trancoso</author>
</authors>
<title>Paraphrasing 4 microblog normalization.</title>
<date>2013</date>
<booktitle>In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>73--84</pages>
<contexts>
<context position="6286" citStr="Ling et al., 2013" startWordPosition="1025" endWordPosition="1028"> by examining in which stage of the SMT pipeline the available indomain data can best be used (Duh et al., 2010), or whether it is more promising to improve either phrase extraction or scoring (Bisazza et al., 2011; Haddow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based MT (Carrera et al., 2009), or models trained on formal and informal data (Banerjee et al., 2011). Finally, Roturier and Bensadoun (2011) conduct a comparative study to determine the ability of several SMT systems to translate UG text, but they do not examine what errors the systems make. To our knowledge, our work is the first that looks inside an SMT system to systematically inspect its behavior ac</context>
<context position="31120" citStr="Ling et al., 2013" startWordPosition="5210" endWordPosition="5213">pose a bigger challenge to UG translation tasks than source OOVs. In our qualitative analysis we found that common issues in UG data include (i) OOVs due to misspellings or Arabic dialectal forms, (ii) lexical choices that do not reflect colloquial formulations, (iii) phrasal idioms being translated word by word, and (iv) omitted first person pronouns in SMS and chat. Finally, different types of UG exhibit dissimilar error distributions, demanding diverse strategies to improve SMT quality. For example, SMS and chat data might benefit from text normalization (Bertoldi et al., 2010; Yvon, 2010; Ling et al., 2013a) or otherwise resolving source OOVs, which also has been the main focus of previous work on SMT for UG. On the other hand, while research in domain adaptation for SMT often aims at better scoring of existing translation candidates, we have shown that for many UG tasks the most promising direction involves increasing phrase pair recall of the SMT models (i.e., reducing phrase pair OOVs), for example by paraphrasing (Callison-Burch et al., 2006) or translation synthesis (Irvine and Callison-Burch, 2014). Acknowledgments This research was funded in part by the Netherlands Organization for Scien</context>
</contexts>
<marker>Ling, Dyer, Black, Trancoso, 2013</marker>
<rawString>Wang Ling, Chris Dyer, Alan W Black, and Isabel Trancoso. 2013a. Paraphrasing 4 microblog normalization. In Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing, pages 73–84.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Wang Ling</author>
<author>Guang Xiang</author>
<author>Chris Dyer</author>
<author>Alan Black</author>
<author>Isabel Trancoso</author>
</authors>
<title>Microblogs as parallel corpora.</title>
<date>2013</date>
<booktitle>In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers),</booktitle>
<pages>176--186</pages>
<contexts>
<context position="6286" citStr="Ling et al., 2013" startWordPosition="1025" endWordPosition="1028"> by examining in which stage of the SMT pipeline the available indomain data can best be used (Duh et al., 2010), or whether it is more promising to improve either phrase extraction or scoring (Bisazza et al., 2011; Haddow and Koehn, 2012). The vast majority of SMT research, including the above described work on error analysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based MT (Carrera et al., 2009), or models trained on formal and informal data (Banerjee et al., 2011). Finally, Roturier and Bensadoun (2011) conduct a comparative study to determine the ability of several SMT systems to translate UG text, but they do not examine what errors the systems make. To our knowledge, our work is the first that looks inside an SMT system to systematically inspect its behavior ac</context>
<context position="31120" citStr="Ling et al., 2013" startWordPosition="5210" endWordPosition="5213">pose a bigger challenge to UG translation tasks than source OOVs. In our qualitative analysis we found that common issues in UG data include (i) OOVs due to misspellings or Arabic dialectal forms, (ii) lexical choices that do not reflect colloquial formulations, (iii) phrasal idioms being translated word by word, and (iv) omitted first person pronouns in SMS and chat. Finally, different types of UG exhibit dissimilar error distributions, demanding diverse strategies to improve SMT quality. For example, SMS and chat data might benefit from text normalization (Bertoldi et al., 2010; Yvon, 2010; Ling et al., 2013a) or otherwise resolving source OOVs, which also has been the main focus of previous work on SMT for UG. On the other hand, while research in domain adaptation for SMT often aims at better scoring of existing translation candidates, we have shown that for many UG tasks the most promising direction involves increasing phrase pair recall of the SMT models (i.e., reducing phrase pair OOVs), for example by paraphrasing (Callison-Burch et al., 2006) or translation synthesis (Irvine and Callison-Burch, 2014). Acknowledgments This research was funded in part by the Netherlands Organization for Scien</context>
</contexts>
<marker>Ling, Xiang, Dyer, Black, Trancoso, 2013</marker>
<rawString>Wang Ling, Guang Xiang, Chris Dyer, Alan Black, and Isabel Trancoso. 2013b. Microblogs as parallel corpora. In Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 176–186.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Spyros Matsoukas</author>
<author>Antti-Veikko I. Rosti</author>
<author>Bing Zhang</author>
</authors>
<title>Discriminative corpus weight estimation for machine translation.</title>
<date>2009</date>
<booktitle>In Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing,</booktitle>
<pages>708--717</pages>
<contexts>
<context position="25875" citStr="Matsoukas et al., 2009" startWordPosition="4312" endWordPosition="4315">rs for Chinese-English SMS and chat. This indicates that a promising strategy for adapting SMT systems to translating UG data involves generating new target-side translation candidates that match the source phrases in the input sentences. Finally, we evaluate the fraction of SCORE errors. While this is the most commonly observed error type in most of the data sets, there seems to be very little correspondance with the genre or BLEU scores of the benchmarks. This is an interesting finding since most work in system adaptation for SMT focuses on better scoring of existing translation candidates (Matsoukas et al., 2009; Foster et al., 2010; Axelrod et al., 2011; Chen et al., 2013, among others). However, for UG translation tasks this does not appear as the most profitable approach. Qualitative results. The generated sentencelevel error annotations allow us to examine the various error types in detail. The first phenomenon that we repeatedly observe in the UG data are SEEN errors due to misspellings or, in the case of Arabic, dialectal forms. Two such examples are shown in Figures 6A and 6B: In the first, the SMT system does not recognize the dialectal form of verb negation ‘mtzEl$’, which is a morphological</context>
</contexts>
<marker>Matsoukas, Rosti, Zhang, 2009</marker>
<rawString>Spyros Matsoukas, Antti-Veikko I. Rosti, and Bing Zhang. 2009. Discriminative corpus weight estimation for machine translation. In Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing, pages 708–717.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Franz Josef Och</author>
<author>Hermann Ney</author>
</authors>
<title>A systematic comparison of various statistical alignment models.</title>
<date>2003</date>
<journal>Computational Linguistics,</journal>
<volume>29</volume>
<issue>1</issue>
<contexts>
<context position="15265" citStr="Och and Ney, 2003" startWordPosition="2498" endWordPosition="2501"> when utilizing many small phrases rather than few large phrases. If, in spite of that, a system selects many short phrases, which is the case for most of our UG benchmarks, this can be due to (i) unreliable translation probabilities or (ii) to the mere lack of correct translation options in the models. We investigate both issues in the following analyses. 4.3 Model coverage analysis Next, we examine the translation model coverage for each data set, which tells us what phrases the system could have used for decoding. For each of our test sets, we create automatic word alignments using GIZA++ (Och and Ney, 2003), and extract from these the set of all reference phrase pairs using Moses’ phrase extraction algorithm (Koehn et al., 2007). By comparing this set of phrase pairs to the available phrases in the SMT models, which 31 Table 3: Target language model perplexity and translation model coverage of Arabic-English benchmarks. Phrase pair recall values are broken down by source phrase length. Intensities of the cell colors indicate relative recall values with respect to the best scoring benchmark (measured in BLEU). Source phrase recall Genre BLEU LM PP 1 2 3 4 News 1 33.8 65 99.7 88.9 56.3 26.1 News 2</context>
</contexts>
<marker>Och, Ney, 2003</marker>
<rawString>Franz Josef Och and Hermann Ney. 2003. A systematic comparison of various statistical alignment models. Computational Linguistics, 29(1):19–51.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Kishore Papineni</author>
<author>Salim Roukos</author>
<author>Todd Ward</author>
<author>Wei-Jing Zhu</author>
</authors>
<title>BLEU: a method for automatic evaluation of machine translation.</title>
<date>2002</date>
<booktitle>In Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics,</booktitle>
<pages>311--318</pages>
<contexts>
<context position="1325" citStr="Papineni et al., 2002" startWordPosition="198" endWordPosition="201">lp to demystify the poor SMT performance experienced by researchers who use SMT as an intermediate step of their UG-NLP pipeline, and to identify translation modeling aspects that the SMT community should more urgently address to improve translation of UG data. 1 Introduction User-generated (UG) text such as found on social media and web forums poses different challenges to statistical machine translation (SMT) than formal text. This is reflected by poor translation quality for informal genres (see for example Figure 1), which is typically measured with automatic quality metrics such as BLEU (Papineni et al., 2002), METEOR (Banerjee and Lavie, 2005), or TER (Snover et al., 2006). These scores alone, however, only reflect the overall translation quality, and do not provide any insight in what exactly makes translating UG text hard. While such knowledge is crucial for improving SMT of UG text, surprisingly little work on error analysis for SMT of usergenerated text has been reported. Moreover, the notion of user-generated content ﺯﺗﻣﻝﺎﻳﻌﻟﺍﻥﺎﺷﻋﺕﻟﺎﻗ she said so the kids do not feel upset she said because of the sons In (Chinese): 你 路上 慢 点 Reference: take your time MT output: you are on the road to slow poin</context>
</contexts>
<marker>Papineni, Roukos, Ward, Zhu, 2002</marker>
<rawString>Kishore Papineni, Salim Roukos, Todd Ward, and WeiJing Zhu. 2002. BLEU: a method for automatic evaluation of machine translation. In Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pages 311–318.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Maja Popović</author>
<author>Hermann Ney</author>
</authors>
<title>Towards automatic error analysis of machine translation output.</title>
<date>2011</date>
<journal>Computational Linguistics,</journal>
<volume>37</volume>
<issue>4</issue>
<marker>Popović, Ney, 2011</marker>
<rawString>Maja Popović and Hermann Ney. 2011. Towards automatic error analysis of machine translation output. Computational Linguistics, 37(4):657–688.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Johann Roturier</author>
<author>Anthony Bensadoun</author>
</authors>
<title>Evaluation of MT systems to translate user generated content.</title>
<date>2011</date>
<booktitle>In Proceedings of the XIII Machine Translation Summit,</booktitle>
<pages>244--251</pages>
<contexts>
<context position="6620" citStr="Roturier and Bensadoun (2011)" startWordPosition="1081" endWordPosition="1084">alysis, is evaluated on data containing formal language. Work on SMT of informal text mostly targets reduction of OOV words in the source text, for example by correcting spelling errors (Bertoldi et al., 2010), normalizing noisy text to more formal text (Banerjee et al., 2012; Ling et al., 2013a), or enhancing the training data with bilingual segments extracted from Twitter (Jehl et al., 2012; Ling et al., 2013b). Other work improves SMT of UG text by combining statistical and rule-based MT (Carrera et al., 2009), or models trained on formal and informal data (Banerjee et al., 2011). Finally, Roturier and Bensadoun (2011) conduct a comparative study to determine the ability of several SMT systems to translate UG text, but they do not examine what errors the systems make. To our knowledge, our work is the first that looks inside an SMT system to systematically inspect its behavior across a diverse spectrum of UG text types. 3 Experimental setup We perform our error analysis on two language pairs, Arabic-English and Chinese-English. 3.1 Evaluation sets For both language pairs we use evaluation sets for five types of user-generated text: SMS messages, chat messages, manual transcripts of phone conversations (call</context>
</contexts>
<marker>Roturier, Bensadoun, 2011</marker>
<rawString>Johann Roturier and Anthony Bensadoun. 2011. Evaluation of MT systems to translate user generated content. In Proceedings of the XIII Machine Translation Summit, pages 244–251.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Matthew Snover</author>
<author>Bonnie Dorr</author>
<author>Richard Schwartz</author>
<author>Linnea Micciulla</author>
<author>John Makhoul</author>
</authors>
<title>A study of translation edit rate with targeted human annotation.</title>
<date>2006</date>
<booktitle>In Proceedings of the Seventh Conference of the Association for Machine Translation in the Americas,</booktitle>
<pages>223--231</pages>
<contexts>
<context position="1390" citStr="Snover et al., 2006" startWordPosition="209" endWordPosition="212"> who use SMT as an intermediate step of their UG-NLP pipeline, and to identify translation modeling aspects that the SMT community should more urgently address to improve translation of UG data. 1 Introduction User-generated (UG) text such as found on social media and web forums poses different challenges to statistical machine translation (SMT) than formal text. This is reflected by poor translation quality for informal genres (see for example Figure 1), which is typically measured with automatic quality metrics such as BLEU (Papineni et al., 2002), METEOR (Banerjee and Lavie, 2005), or TER (Snover et al., 2006). These scores alone, however, only reflect the overall translation quality, and do not provide any insight in what exactly makes translating UG text hard. While such knowledge is crucial for improving SMT of UG text, surprisingly little work on error analysis for SMT of usergenerated text has been reported. Moreover, the notion of user-generated content ﺯﺗﻣﻝﺎﻳﻌﻟﺍﻥﺎﺷﻋﺕﻟﺎﻗ she said so the kids do not feel upset she said because of the sons In (Chinese): 你 路上 慢 点 Reference: take your time MT output: you are on the road to slow points Figure 1: SMS examples with poor SMT output. only partially sp</context>
</contexts>
<marker>Snover, Dorr, Schwartz, Micciulla, Makhoul, 2006</marker>
<rawString>Matthew Snover, Bonnie Dorr, Richard Schwartz, Linnea Micciulla, and John Makhoul. 2006. A study of translation edit rate with targeted human annotation. In Proceedings of the Seventh Conference of the Association for Machine Translation in the Americas, pages 223–231.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Huihsin Tseng</author>
<author>Pichuan Chang</author>
<author>Galen Andrew</author>
<author>Daniel Jurafsky</author>
<author>Christopher Manning</author>
</authors>
<title>A conditional random field word segmenter.</title>
<date>2005</date>
<booktitle>In Proceedings of the fourth SIGHAN workshop on Chinese language Processing,</booktitle>
<volume>171</volume>
<pages>168--171</pages>
<contexts>
<context position="9632" citStr="Tseng et al. (2005)" startWordPosition="1570" endWordPosition="1573">. Tables 1 and 2 show the data specifications of the Arabic-English and ChineseEnglish evaluation sets, respectively.2 3.2 SMT systems All experiments presented in this paper are performed with our in-house state-of-the-art system based on phrase-based SMT and similar to Moses (Koehn et al., 2007). Our Arabic-English system is built from 1.75M lines (52.9M source tokens) of parallel text, and our Chinese-English system from 3.13M lines (55.4M source tokens) of parallel text. We tokenize all Arabic data using MADA (Habash and Rambow, 2005), ATB scheme, and we segment the Chinese data following Tseng et al. (2005). Both systems use an adapted 5-gram English language model that linearly interpolates different English Gigaword subcorpora with the 2Note that two evaluation sets contain four reference translations instead of one. To allow for fair comparison, we average the scores of the four references in all our analyses. English side of our bitexts, containing both news and UG data. While parallel data is scarce in general, the situation is much worse for UG data, where there are hardly any sizable parallel corpora for any language pair. As a consequence, the training data of both systems comprises 70-7</context>
</contexts>
<marker>Tseng, Chang, Andrew, Jurafsky, Manning, 2005</marker>
<rawString>Huihsin Tseng, Pichuan Chang, Galen Andrew, Daniel Jurafsky, and Christopher Manning. 2005. A conditional random field word segmenter. In Proceedings of the fourth SIGHAN workshop on Chinese language Processing, volume 171, pages 168–171.</rawString>
</citation>
<citation valid="true">
<authors>
<author>Marlies van der Wees</author>
<author>Arianna Bisazza</author>
<author>Wouter Weerkamp</author>
<author>Christof Monz</author>
</authors>
<title>What’s in a domain? Analyzing genre and topic differences in statistical machine translation.</title>
<date>2015</date>
<booktitle>In Proceedings of the Joint Conference of the 53rd Annual Meeting of the ACL and the 7th International Joint Conference on Natural Language Processing of the AFNLP.</booktitle>
<marker>van der Wees, Bisazza, Weerkamp, Monz, 2015</marker>
<rawString>Marlies van der Wees, Arianna Bisazza, Wouter Weerkamp, and Christof Monz. 2015. What’s in a domain? Analyzing genre and topic differences in statistical machine translation. In Proceedings of the Joint Conference of the 53rd Annual Meeting of the ACL and the 7th International Joint Conference on Natural Language Processing of the AFNLP.</rawString>
</citation>
<citation valid="true">
<authors>
<author>François Yvon</author>
</authors>
<title>Rewriting the orthography of SMS messages.</title>
<date>2010</date>
<journal>Natural Language Engineering,</journal>
<volume>16</volume>
<issue>2</issue>
<contexts>
<context position="31101" citStr="Yvon, 2010" startWordPosition="5208" endWordPosition="5209">e-pair OOVs pose a bigger challenge to UG translation tasks than source OOVs. In our qualitative analysis we found that common issues in UG data include (i) OOVs due to misspellings or Arabic dialectal forms, (ii) lexical choices that do not reflect colloquial formulations, (iii) phrasal idioms being translated word by word, and (iv) omitted first person pronouns in SMS and chat. Finally, different types of UG exhibit dissimilar error distributions, demanding diverse strategies to improve SMT quality. For example, SMS and chat data might benefit from text normalization (Bertoldi et al., 2010; Yvon, 2010; Ling et al., 2013a) or otherwise resolving source OOVs, which also has been the main focus of previous work on SMT for UG. On the other hand, while research in domain adaptation for SMT often aims at better scoring of existing translation candidates, we have shown that for many UG tasks the most promising direction involves increasing phrase pair recall of the SMT models (i.e., reducing phrase pair OOVs), for example by paraphrasing (Callison-Burch et al., 2006) or translation synthesis (Irvine and Callison-Burch, 2014). Acknowledgments This research was funded in part by the Netherlands Org</context>
</contexts>
<marker>Yvon, 2010</marker>
<rawString>François Yvon. 2010. Rewriting the orthography of SMS messages. Natural Language Engineering, 16(2):133–159.</rawString>
</citation>
</citationList>
</algorithm>
</algorithms>