@inproceedings{mikelenic-tadic-2020-building,
title = "Building the {S}panish-{C}roatian Parallel Corpus",
author = "Mikeleni{\'c}, Bojana and
Tadi{\'c}, Marko",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Twelfth Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.484",
pages = "3932--3936",
abstract = "This paper describes the building of the first Spanish-Croatian unidirectional parallel corpus, which has been constructed at the Faculty of Humanities and Social Sciences of the University of Zagreb. The corpus is comprised of eleven Spanish novels and their translations to Croatian done by six different professional translators. All the texts were published between 1999 and 2012. The corpus has more than 2 Mw, with approximately 1 Mw for each language. It was automatically sentence segmented and aligned, as well as manually post-corrected, and contains 71,778 translation units. In order to protect the copyright and to make the corpus available under permissive CC-BY licence, the aligned translation units are shuffled. This limits the usability of the corpus for research of language units at sentence and lower language levels only. There are two versions of the corpus in TMX format that will be available for download through META-SHARE and CLARIN ERIC infrastructure. The former contains plain TMX, while the latter is lemmatised and POS-tagged and stored in the aTMX format.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mikelenic-tadic-2020-building">
<titleInfo>
<title>Building the Spanish-Croatian Parallel Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bojana</namePart>
<namePart type="family">Mikelenić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Tadić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Twelfth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>This paper describes the building of the first Spanish-Croatian unidirectional parallel corpus, which has been constructed at the Faculty of Humanities and Social Sciences of the University of Zagreb. The corpus is comprised of eleven Spanish novels and their translations to Croatian done by six different professional translators. All the texts were published between 1999 and 2012. The corpus has more than 2 Mw, with approximately 1 Mw for each language. It was automatically sentence segmented and aligned, as well as manually post-corrected, and contains 71,778 translation units. In order to protect the copyright and to make the corpus available under permissive CC-BY licence, the aligned translation units are shuffled. This limits the usability of the corpus for research of language units at sentence and lower language levels only. There are two versions of the corpus in TMX format that will be available for download through META-SHARE and CLARIN ERIC infrastructure. The former contains plain TMX, while the latter is lemmatised and POS-tagged and stored in the aTMX format.</abstract>
<identifier type="citekey">mikelenic-tadic-2020-building</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.484</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>3932</start>
<end>3936</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Building the Spanish-Croatian Parallel Corpus
%A Mikelenić, Bojana
%A Tadić, Marko
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Twelfth Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F mikelenic-tadic-2020-building
%X This paper describes the building of the first Spanish-Croatian unidirectional parallel corpus, which has been constructed at the Faculty of Humanities and Social Sciences of the University of Zagreb. The corpus is comprised of eleven Spanish novels and their translations to Croatian done by six different professional translators. All the texts were published between 1999 and 2012. The corpus has more than 2 Mw, with approximately 1 Mw for each language. It was automatically sentence segmented and aligned, as well as manually post-corrected, and contains 71,778 translation units. In order to protect the copyright and to make the corpus available under permissive CC-BY licence, the aligned translation units are shuffled. This limits the usability of the corpus for research of language units at sentence and lower language levels only. There are two versions of the corpus in TMX format that will be available for download through META-SHARE and CLARIN ERIC infrastructure. The former contains plain TMX, while the latter is lemmatised and POS-tagged and stored in the aTMX format.
%U https://aclanthology.org/2020.lrec-1.484
%P 3932-3936
Markdown (Informal)
[Building the Spanish-Croatian Parallel Corpus](https://aclanthology.org/2020.lrec-1.484) (Mikelenić & Tadić, LREC 2020)
ACL
- Bojana Mikelenić and Marko Tadić. 2020. Building the Spanish-Croatian Parallel Corpus. In Proceedings of the Twelfth Language Resources and Evaluation Conference, pages 3932–3936, Marseille, France. European Language Resources Association.