@inproceedings{vanallemeersch-2010-belgisch,
title = "Belgisch Staatsblad Corpus: Retrieving {F}rench-{D}utch Sentences from Official Documents",
author = "Vanallemeersch, Tom",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2010/pdf/758_Paper.pdf",
abstract = "We describe the compilation of a large corpus of French-Dutch sentence pairs from official Belgian documents which are available in the online version of the publication Belgisch Staatsblad/Moniteur belge, and which have been published between 1997 and 2006. After downloading files in batch, we filtered out documents which have no translation in the other language, documents which contain several languages (by checking on discriminating words), and pairs of documents with a substantial difference in length. We segmented the documents into sentences and aligned the latter, which resulted in 5 million sentence pairs (only one-to-one links were included in the parallel corpus); there are 2.4 million unique pairs. Sample-based evaluation of the sentence alignment results indicates a near 100{\%} accuracy, which can be explained by the text genre, the procedure filtering out weakly parallel articles and the restriction to one-to-one links. The corpus is larger than a number of well-known French-Dutch resources. It is made available to the community. Further investigation is needed in order to determine the original language in which documents were written.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vanallemeersch-2010-belgisch">
<titleInfo>
<title>Belgisch Staatsblad Corpus: Retrieving French-Dutch Sentences from Official Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Vanallemeersch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2010-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mike</namePart>
<namePart type="family">Rosner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Tapias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Valletta, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We describe the compilation of a large corpus of French-Dutch sentence pairs from official Belgian documents which are available in the online version of the publication Belgisch Staatsblad/Moniteur belge, and which have been published between 1997 and 2006. After downloading files in batch, we filtered out documents which have no translation in the other language, documents which contain several languages (by checking on discriminating words), and pairs of documents with a substantial difference in length. We segmented the documents into sentences and aligned the latter, which resulted in 5 million sentence pairs (only one-to-one links were included in the parallel corpus); there are 2.4 million unique pairs. Sample-based evaluation of the sentence alignment results indicates a near 100% accuracy, which can be explained by the text genre, the procedure filtering out weakly parallel articles and the restriction to one-to-one links. The corpus is larger than a number of well-known French-Dutch resources. It is made available to the community. Further investigation is needed in order to determine the original language in which documents were written.</abstract>
<identifier type="citekey">vanallemeersch-2010-belgisch</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2010/pdf/758_Paper.pdf</url>
</location>
<part>
<date>2010-05</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Belgisch Staatsblad Corpus: Retrieving French-Dutch Sentences from Official Documents
%A Vanallemeersch, Tom
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F vanallemeersch-2010-belgisch
%X We describe the compilation of a large corpus of French-Dutch sentence pairs from official Belgian documents which are available in the online version of the publication Belgisch Staatsblad/Moniteur belge, and which have been published between 1997 and 2006. After downloading files in batch, we filtered out documents which have no translation in the other language, documents which contain several languages (by checking on discriminating words), and pairs of documents with a substantial difference in length. We segmented the documents into sentences and aligned the latter, which resulted in 5 million sentence pairs (only one-to-one links were included in the parallel corpus); there are 2.4 million unique pairs. Sample-based evaluation of the sentence alignment results indicates a near 100% accuracy, which can be explained by the text genre, the procedure filtering out weakly parallel articles and the restriction to one-to-one links. The corpus is larger than a number of well-known French-Dutch resources. It is made available to the community. Further investigation is needed in order to determine the original language in which documents were written.
%U http://www.lrec-conf.org/proceedings/lrec2010/pdf/758_Paper.pdf
Markdown (Informal)
[Belgisch Staatsblad Corpus: Retrieving French-Dutch Sentences from Official Documents](http://www.lrec-conf.org/proceedings/lrec2010/pdf/758_Paper.pdf) (Vanallemeersch, LREC 2010)
ACL