@inproceedings{neves-etal-2016-scielo,
title = "The Scielo Corpus: a Parallel Corpus of Scientific Publications for Biomedicine",
author = "Neves, Mariana and
Yepes, Antonio Jimeno and
N{\'e}v{\'e}ol, Aur{\'e}lie",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1470",
pages = "2942--2948",
abstract = "The biomedical scientific literature is a rich source of information not only in the English language, for which it is more abundant, but also in other languages, such as Portuguese, Spanish and French. We present the first freely available parallel corpus of scientific publications for the biomedical domain. Documents from the {''}Biological Sciences{''} and {''}Health Sciences{''} categories were retrieved from the Scielo database and parallel titles and abstracts are available for the following language pairs: Portuguese/English (about 86,000 documents in total), Spanish/English (about 95,000 documents) and French/English (about 2,000 documents). Additionally, monolingual data was also collected for all four languages. Sentences in the parallel corpus were automatically aligned and a manual analysis of 200 documents by native experts found that a minimum of 79{\%} of sentences were correctly aligned in all language pairs. We demonstrate the utility of the corpus by running baseline machine translation experiments. We show that for all language pairs, a statistical machine translation system trained on the parallel corpora achieves performance that rivals or exceeds the state of the art in the biomedical domain. Furthermore, the corpora are currently being used in the biomedical task in the First Conference on Machine Translation (WMT{'}16).",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="neves-etal-2016-scielo">
<titleInfo>
<title>The Scielo Corpus: a Parallel Corpus of Scientific Publications for Biomedicine</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Neves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="given">Jimeno</namePart>
<namePart type="family">Yepes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurélie</namePart>
<namePart type="family">Névéol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The biomedical scientific literature is a rich source of information not only in the English language, for which it is more abundant, but also in other languages, such as Portuguese, Spanish and French. We present the first freely available parallel corpus of scientific publications for the biomedical domain. Documents from the ”Biological Sciences” and ”Health Sciences” categories were retrieved from the Scielo database and parallel titles and abstracts are available for the following language pairs: Portuguese/English (about 86,000 documents in total), Spanish/English (about 95,000 documents) and French/English (about 2,000 documents). Additionally, monolingual data was also collected for all four languages. Sentences in the parallel corpus were automatically aligned and a manual analysis of 200 documents by native experts found that a minimum of 79% of sentences were correctly aligned in all language pairs. We demonstrate the utility of the corpus by running baseline machine translation experiments. We show that for all language pairs, a statistical machine translation system trained on the parallel corpora achieves performance that rivals or exceeds the state of the art in the biomedical domain. Furthermore, the corpora are currently being used in the biomedical task in the First Conference on Machine Translation (WMT’16).</abstract>
<identifier type="citekey">neves-etal-2016-scielo</identifier>
<location>
<url>https://aclanthology.org/L16-1470</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>2942</start>
<end>2948</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Scielo Corpus: a Parallel Corpus of Scientific Publications for Biomedicine
%A Neves, Mariana
%A Yepes, Antonio Jimeno
%A Névéol, Aurélie
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F neves-etal-2016-scielo
%X The biomedical scientific literature is a rich source of information not only in the English language, for which it is more abundant, but also in other languages, such as Portuguese, Spanish and French. We present the first freely available parallel corpus of scientific publications for the biomedical domain. Documents from the ”Biological Sciences” and ”Health Sciences” categories were retrieved from the Scielo database and parallel titles and abstracts are available for the following language pairs: Portuguese/English (about 86,000 documents in total), Spanish/English (about 95,000 documents) and French/English (about 2,000 documents). Additionally, monolingual data was also collected for all four languages. Sentences in the parallel corpus were automatically aligned and a manual analysis of 200 documents by native experts found that a minimum of 79% of sentences were correctly aligned in all language pairs. We demonstrate the utility of the corpus by running baseline machine translation experiments. We show that for all language pairs, a statistical machine translation system trained on the parallel corpora achieves performance that rivals or exceeds the state of the art in the biomedical domain. Furthermore, the corpora are currently being used in the biomedical task in the First Conference on Machine Translation (WMT’16).
%U https://aclanthology.org/L16-1470
%P 2942-2948
Markdown (Informal)
[The Scielo Corpus: a Parallel Corpus of Scientific Publications for Biomedicine](https://aclanthology.org/L16-1470) (Neves et al., LREC 2016)
ACL