@inproceedings{ljubesic-etal-2016-producing,
title = "Producing Monolingual and Parallel Web Corpora at the Same Time - {S}pider{L}ing and Bitextor{'}s Love Affair",
author = "Ljube{\v{s}}i{\'c}, Nikola and
Espl{\`a}-Gomis, Miquel and
Toral, Antonio and
Rojas, Sergio Ortiz and
Klubi{\v{c}}ka, Filip",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1471",
pages = "2949--2956",
abstract = "This paper presents an approach for building large monolingual corpora and, at the same time, extracting parallel data by crawling the top-level domain of a given language of interest. For gathering linguistically relevant data from top-level domains we use the SpiderLing crawler, modified to crawl data written in multiple languages. The output of this process is then fed to Bitextor, a tool for harvesting parallel data from a collection of documents. We call the system combining these two tools Spidextor, a blend of the names of its two crucial parts. We evaluate the described approach intrinsically by measuring the accuracy of the extracted bitexts from the Croatian top-level domain {``}.hr{''} and the Slovene top-level domain {``}.si{''}, and extrinsically on the English-Croatian language pair by comparing an SMT system built from the crawled data with third-party systems. We finally present parallel datasets collected with our approach for the English-Croatian, English-Finnish, English-Serbian and English-Slovene language pairs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ljubesic-etal-2016-producing">
<titleInfo>
<title>Producing Monolingual and Parallel Web Corpora at the Same Time - SpiderLing and Bitextor’s Love Affair</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergio</namePart>
<namePart type="given">Ortiz</namePart>
<namePart type="family">Rojas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Klubička</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents an approach for building large monolingual corpora and, at the same time, extracting parallel data by crawling the top-level domain of a given language of interest. For gathering linguistically relevant data from top-level domains we use the SpiderLing crawler, modified to crawl data written in multiple languages. The output of this process is then fed to Bitextor, a tool for harvesting parallel data from a collection of documents. We call the system combining these two tools Spidextor, a blend of the names of its two crucial parts. We evaluate the described approach intrinsically by measuring the accuracy of the extracted bitexts from the Croatian top-level domain “.hr” and the Slovene top-level domain “.si”, and extrinsically on the English-Croatian language pair by comparing an SMT system built from the crawled data with third-party systems. We finally present parallel datasets collected with our approach for the English-Croatian, English-Finnish, English-Serbian and English-Slovene language pairs.</abstract>
<identifier type="citekey">ljubesic-etal-2016-producing</identifier>
<location>
<url>https://aclanthology.org/L16-1471</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>2949</start>
<end>2956</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Producing Monolingual and Parallel Web Corpora at the Same Time - SpiderLing and Bitextor’s Love Affair
%A Ljubešić, Nikola
%A Esplà-Gomis, Miquel
%A Toral, Antonio
%A Rojas, Sergio Ortiz
%A Klubička, Filip
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F ljubesic-etal-2016-producing
%X This paper presents an approach for building large monolingual corpora and, at the same time, extracting parallel data by crawling the top-level domain of a given language of interest. For gathering linguistically relevant data from top-level domains we use the SpiderLing crawler, modified to crawl data written in multiple languages. The output of this process is then fed to Bitextor, a tool for harvesting parallel data from a collection of documents. We call the system combining these two tools Spidextor, a blend of the names of its two crucial parts. We evaluate the described approach intrinsically by measuring the accuracy of the extracted bitexts from the Croatian top-level domain “.hr” and the Slovene top-level domain “.si”, and extrinsically on the English-Croatian language pair by comparing an SMT system built from the crawled data with third-party systems. We finally present parallel datasets collected with our approach for the English-Croatian, English-Finnish, English-Serbian and English-Slovene language pairs.
%U https://aclanthology.org/L16-1471
%P 2949-2956
Markdown (Informal)
[Producing Monolingual and Parallel Web Corpora at the Same Time - SpiderLing and Bitextor’s Love Affair](https://aclanthology.org/L16-1471) (Ljubešić et al., LREC 2016)
ACL