% Conference paper in the long-papers volume of the 17th TALN proceedings
% (see booktitle). Language names in the title are brace-protected as whole
% words so that sentence-casing styles keep their capitals.
@inproceedings{el-kholy-habash-2010-orthographic,
  title     = {Orthographic and Morphological Processing for {English}-{Arabic} Statistical Machine Translation},
  author    = {El Kholy, Ahmed and
               Habash, Nizar},
  editor    = {Langlais, Philippe and
               Gagnon, Michel},
  booktitle = {Actes de la 17e conf{\'e}rence sur le Traitement Automatique des Langues Naturelles. Articles longs},
  month     = jul,
  year      = {2010},
  address   = {Montr{\'e}al, Canada},
  publisher = {ATALA},
  url       = {https://aclanthology.org/2010.jeptalnrecital-long.29},
  pages     = {282--291},
  abstract  = {Much of the work on Statistical Machine Translation (SMT) from morphologically rich languages has shown that morphological tokenization and orthographic normalization help improve SMT quality because of the sparsity reduction they contribute. In this paper, we study the effect of these processes on SMT when translating into a morphologically rich language, namely Arabic. We explore a space of tokenization schemes and normalization options. We only evaluate on detokenized and orthographically correct (enriched) output. Our results show that the best performing tokenization scheme is that of the Penn Arabic Treebank. Additionally, training on orthographically normalized (reduced) text then jointly enriching and detokenizing the output outperforms training on enriched text.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record (citekey el-kholy-habash-2010-orthographic). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="el-kholy-habash-2010-orthographic">
    <!-- The paper itself: title, authors, issue date. -->
    <titleInfo>
      <title>Orthographic and Morphological Processing for English-Arabic Statistical Machine Translation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Ahmed</namePart>
      <namePart type="family">El Kholy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nizar</namePart>
      <namePart type="family">Habash</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2010-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Actes de la 17e conférence sur le Traitement Automatique des Langues Naturelles. Articles longs</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Philippe</namePart>
        <namePart type="family">Langlais</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Michel</namePart>
        <namePart type="family">Gagnon</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ATALA</publisher>
        <place>
          <placeTerm type="text">Montréal, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Much of the work on Statistical Machine Translation (SMT) from morphologically rich languages has shown that morphological tokenization and orthographic normalization help improve SMT quality because of the sparsity reduction they contribute. In this paper, we study the effect of these processes on SMT when translating into a morphologically rich language, namely Arabic. We explore a space of tokenization schemes and normalization options. We only evaluate on detokenized and orthographically correct (enriched) output. Our results show that the best performing tokenization scheme is that of the Penn Arabic Treebank. Additionally, training on orthographically normalized (reduced) text then jointly enriching and detokenizing the output outperforms training on enriched text.</abstract>
    <identifier type="citekey">el-kholy-habash-2010-orthographic</identifier>
    <location>
      <url>https://aclanthology.org/2010.jeptalnrecital-long.29</url>
    </location>
    <!-- Position within the host volume: date and page range. -->
    <part>
      <date>2010-07</date>
      <extent unit="page">
        <start>282</start>
        <end>291</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Orthographic and Morphological Processing for English-Arabic Statistical Machine Translation
%A El Kholy, Ahmed
%A Habash, Nizar
%Y Langlais, Philippe
%Y Gagnon, Michel
%S Actes de la 17e conférence sur le Traitement Automatique des Langues Naturelles. Articles longs
%D 2010
%8 July
%I ATALA
%C Montréal, Canada
%F el-kholy-habash-2010-orthographic
%X Much of the work on Statistical Machine Translation (SMT) from morphologically rich languages has shown that morphological tokenization and orthographic normalization help improve SMT quality because of the sparsity reduction they contribute. In this paper, we study the effect of these processes on SMT when translating into a morphologically rich language, namely Arabic. We explore a space of tokenization schemes and normalization options. We only evaluate on detokenized and orthographically correct (enriched) output. Our results show that the best performing tokenization scheme is that of the Penn Arabic Treebank. Additionally, training on orthographically normalized (reduced) text then jointly enriching and detokenizing the output outperforms training on enriched text.
%U https://aclanthology.org/2010.jeptalnrecital-long.29
%P 282-291
Markdown (Informal)
[Orthographic and Morphological Processing for English-Arabic Statistical Machine Translation](https://aclanthology.org/2010.jeptalnrecital-long.29) (El Kholy & Habash, JEP/TALN/RECITAL 2010)
ACL