@inproceedings{samy-etal-2006-building,
title = "Building a Parallel Multilingual Corpus ({A}rabic-{S}panish-{E}nglish)",
author = "Samy, Doaa and
Sandoval, Antonio Moreno and
Guirao, Jos{\'e} M. and
Alfonseca, Enrique",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Gangemi, Aldo and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Tapias, Daniel",
booktitle = "Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)",
month = may,
year = "2006",
address = "Genoa, Italy",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2006/pdf/238_pdf.pdf",
abstract = "This paper presents the results (1st phase) of the on-going research in the Computational Linguistics Laboratory at Aut{\'o}noma University of Madrid (LLI-UAM) aiming at the development of a multi-lingual parallel corpus (Arabic-Spanish-English) aligned on the sentence level and tagged on the POS level. A multilingual parallel corpus which brings together Arabic, Spanish and English is a new resource for the NLP community that completes the present panorama of parallel corpora. In the first part of this study, we introduce the novelty of our approach and the challenges encountered to create such a corpus. This introductory part highlights the main features of the corpus and the criteria applied during the selection process. The second part focuses on two main stages: basic processing (tokenization and segmentation) and alignment. Methodology of alignment is explained in detail and results obtained in the three different linguistic pairs are compared. POS tagging and tools used in this stage are discussed in the third part. The final output is available in two versions: the non-aligned version and the aligned one. The latter adopts the TMX (Translation Memory Exchange) standard format. At the end, the section dedicated to the future work points out the key stages concerned with extending the corpus and the studies that can benefit, directly or indirectly, from such a resource.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="samy-etal-2006-building">
<titleInfo>
<title>Building a Parallel Multilingual Corpus (Arabic-Spanish-English)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Doaa</namePart>
<namePart type="family">Samy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="given">Moreno</namePart>
<namePart type="family">Sandoval</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">José</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Guirao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrique</namePart>
<namePart type="family">Alfonseca</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2006-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aldo</namePart>
<namePart type="family">Gangemi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Genoa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents the results (1st phase) of the on-going research in the Computational Linguistics Laboratory at Autónoma University of Madrid (LLI-UAM) aiming at the development of a multi-lingual parallel corpus (Arabic-Spanish-English) aligned on the sentence level and tagged on the POS level. A multilingual parallel corpus which brings together Arabic, Spanish and English is a new resource for the NLP community that completes the present panorama of parallel corpora. In the first part of this study, we introduce the novelty of our approach and the challenges encountered to create such a corpus. This introductory part highlights the main features of the corpus and the criteria applied during the selection process. The second part focuses on two main stages: basic processing (tokenization and segmentation) and alignment. Methodology of alignment is explained in detail and results obtained in the three different linguistic pairs are compared. POS tagging and tools used in this stage are discussed in the third part. The final output is available in two versions: the non-aligned version and the aligned one. The latter adopts the TMX (Translation Memory Exchange) standard format. At the end, the section dedicated to the future work points out the key stages concerned with extending the corpus and the studies that can benefit, directly or indirectly, from such a resource.</abstract>
<identifier type="citekey">samy-etal-2006-building</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2006/pdf/238_pdf.pdf</url>
</location>
<part>
<date>2006-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Building a Parallel Multilingual Corpus (Arabic-Spanish-English)
%A Samy, Doaa
%A Sandoval, Antonio Moreno
%A Guirao, José M.
%A Alfonseca, Enrique
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Gangemi, Aldo
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Tapias, Daniel
%S Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)
%D 2006
%8 May
%I European Language Resources Association (ELRA)
%C Genoa, Italy
%F samy-etal-2006-building
%X This paper presents the results (1st phase) of the on-going research in the Computational Linguistics Laboratory at Autónoma University of Madrid (LLI-UAM) aiming at the development of a multi-lingual parallel corpus (Arabic-Spanish-English) aligned on the sentence level and tagged on the POS level. A multilingual parallel corpus which brings together Arabic, Spanish and English is a new resource for the NLP community that completes the present panorama of parallel corpora. In the first part of this study, we introduce the novelty of our approach and the challenges encountered to create such a corpus. This introductory part highlights the main features of the corpus and the criteria applied during the selection process. The second part focuses on two main stages: basic processing (tokenization and segmentation) and alignment. Methodology of alignment is explained in detail and results obtained in the three different linguistic pairs are compared. POS tagging and tools used in this stage are discussed in the third part. The final output is available in two versions: the non-aligned version and the aligned one. The latter adopts the TMX (Translation Memory Exchange) standard format. At the end, the section dedicated to the future work points out the key stages concerned with extending the corpus and the studies that can benefit, directly or indirectly, from such a resource.
%U http://www.lrec-conf.org/proceedings/lrec2006/pdf/238_pdf.pdf
Markdown (Informal)
[Building a Parallel Multilingual Corpus (Arabic-Spanish-English)](http://www.lrec-conf.org/proceedings/lrec2006/pdf/238_pdf.pdf) (Samy et al., LREC 2006)
ACL