@inproceedings{ferrero-etal-2016-multilingual,
title = "A Multilingual, Multi-style and Multi-granularity Dataset for Cross-language Textual Similarity Detection",
author = "Ferrero, J{\'e}r{\'e}my and
Agn{\`e}s, Fr{\'e}d{\'e}ric and
Besacier, Laurent and
Schwab, Didier",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1657",
pages = "4162--4169",
abstract = "In this paper we describe our effort to create a dataset for the evaluation of cross-language textual similarity detection. We present preexisting corpora and their limits and we explain the various gathered resources to overcome these limits and build our enriched dataset. The proposed dataset is multilingual, includes cross-language alignment for different granularities (from chunk to document), is based on both parallel and comparable corpora and contains human and machine translated texts. Moreover, it includes texts written by multiple types of authors (from average to professionals). With the obtained dataset, we conduct a systematic and rigorous evaluation of several state-of-the-art cross-language textual similarity detection methods. The evaluation results are reviewed and discussed. Finally, dataset and scripts are made publicly available on GitHub: \url{http://github.com/FerreroJeremy/Cross-Language-Dataset}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ferrero-etal-2016-multilingual">
<titleInfo>
<title>A Multilingual, Multi-style and Multi-granularity Dataset for Cross-language Textual Similarity Detection</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jérémy</namePart>
<namePart type="family">Ferrero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Agnès</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Didier</namePart>
<namePart type="family">Schwab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we describe our effort to create a dataset for the evaluation of cross-language textual similarity detection. We present preexisting corpora and their limits and we explain the various gathered resources to overcome these limits and build our enriched dataset. The proposed dataset is multilingual, includes cross-language alignment for different granularities (from chunk to document), is based on both parallel and comparable corpora and contains human and machine translated texts. Moreover, it includes texts written by multiple types of authors (from average to professionals). With the obtained dataset, we conduct a systematic and rigorous evaluation of several state-of-the-art cross-language textual similarity detection methods. The evaluation results are reviewed and discussed. Finally, dataset and scripts are made publicly available on GitHub: http://github.com/FerreroJeremy/Cross-Language-Dataset.</abstract>
<identifier type="citekey">ferrero-etal-2016-multilingual</identifier>
<location>
<url>https://aclanthology.org/L16-1657</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>4162</start>
<end>4169</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Multilingual, Multi-style and Multi-granularity Dataset for Cross-language Textual Similarity Detection
%A Ferrero, Jérémy
%A Agnès, Frédéric
%A Besacier, Laurent
%A Schwab, Didier
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F ferrero-etal-2016-multilingual
%X In this paper we describe our effort to create a dataset for the evaluation of cross-language textual similarity detection. We present preexisting corpora and their limits and we explain the various gathered resources to overcome these limits and build our enriched dataset. The proposed dataset is multilingual, includes cross-language alignment for different granularities (from chunk to document), is based on both parallel and comparable corpora and contains human and machine translated texts. Moreover, it includes texts written by multiple types of authors (from average to professionals). With the obtained dataset, we conduct a systematic and rigorous evaluation of several state-of-the-art cross-language textual similarity detection methods. The evaluation results are reviewed and discussed. Finally, dataset and scripts are made publicly available on GitHub: http://github.com/FerreroJeremy/Cross-Language-Dataset.
%U https://aclanthology.org/L16-1657
%P 4162-4169
Markdown (Informal)
[A Multilingual, Multi-style and Multi-granularity Dataset for Cross-language Textual Similarity Detection](https://aclanthology.org/L16-1657) (Ferrero et al., LREC 2016)
ACL