@inproceedings{popa-fabre-etal-2020-french,
title = "{F}rench Contextualized Word-Embeddings with a sip of {C}a{B}e{R}net: a New {F}rench Balanced Reference Corpus",
author = "Popa-Fabre, Murielle and
Ortiz Su{\'a}rez, Pedro Javier and
Sagot, Beno{\^\i}t and
de la Clergerie, {\'E}ric",
editor = {Ba{\'n}ski, Piotr and
Barbaresi, Adrien and
Clematide, Simon and
Kupietz, Marc and
L{\"u}ngen, Harald and
Pisetta, Ines},
booktitle = "Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.cmlc-1.3",
pages = "15--23",
abstract = "This paper investigates the impact of different types and size of training corpora on language models. By asking the fundamental question of quality versus quantity, we compare four French corpora by pre-training four different ELMos and evaluating them on dependency parsing, POS-tagging and Named Entities Recognition downstream tasks. We present and assess the relevance of a new balanced French corpus, CaBeRnet, that features a representative range of language usage, including a balanced variety of genres (oral transcriptions, newspapers, popular magazines, technical reports, fiction, academic texts), in oral and written styles. We hypothesize that a linguistically representative corpus will allow the language models to be more efficient, and therefore yield better evaluation scores on different evaluation sets and tasks. This paper offers three main contributions: (1) two newly built corpora: (a) CaBeRnet, a French Balanced Reference Corpus and (b) CBT-fr a domain-specific corpus having both oral and written style in youth literature, (2) five versions of ELMo pre-trained on differently built corpora, and (3) a whole array of computational results on downstream tasks that deepen our understanding of the effects of corpus balance and register in NLP evaluation.",
language = "English",
ISBN = "979-10-95546-61-0",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="popa-fabre-etal-2020-french">
<titleInfo>
<title>French Contextualized Word-Embeddings with a sip of CaBeRnet: a New French Balanced Reference Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Murielle</namePart>
<namePart type="family">Popa-Fabre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="given">Javier</namePart>
<namePart type="family">Ortiz Suárez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benoît</namePart>
<namePart type="family">Sagot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Éric</namePart>
<namePart type="family">de la Clergerie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Piotr</namePart>
<namePart type="family">Bański</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrien</namePart>
<namePart type="family">Barbaresi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Clematide</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marc</namePart>
<namePart type="family">Kupietz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harald</namePart>
<namePart type="family">Lüngen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ines</namePart>
<namePart type="family">Pisetta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-61-0</identifier>
</relatedItem>
<abstract>This paper investigates the impact of different types and size of training corpora on language models. By asking the fundamental question of quality versus quantity, we compare four French corpora by pre-training four different ELMos and evaluating them on dependency parsing, POS-tagging and Named Entities Recognition downstream tasks. We present and assess the relevance of a new balanced French corpus, CaBeRnet, that features a representative range of language usage, including a balanced variety of genres (oral transcriptions, newspapers, popular magazines, technical reports, fiction, academic texts), in oral and written styles. We hypothesize that a linguistically representative corpus will allow the language models to be more efficient, and therefore yield better evaluation scores on different evaluation sets and tasks. This paper offers three main contributions: (1) two newly built corpora: (a) CaBeRnet, a French Balanced Reference Corpus and (b) CBT-fr a domain-specific corpus having both oral and written style in youth literature, (2) five versions of ELMo pre-trained on differently built corpora, and (3) a whole array of computational results on downstream tasks that deepen our understanding of the effects of corpus balance and register in NLP evaluation.</abstract>
<identifier type="citekey">popa-fabre-etal-2020-french</identifier>
<location>
<url>https://aclanthology.org/2020.cmlc-1.3</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>15</start>
<end>23</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T French Contextualized Word-Embeddings with a sip of CaBeRnet: a New French Balanced Reference Corpus
%A Popa-Fabre, Murielle
%A Ortiz Suárez, Pedro Javier
%A Sagot, Benoît
%A de la Clergerie, Éric
%Y Bański, Piotr
%Y Barbaresi, Adrien
%Y Clematide, Simon
%Y Kupietz, Marc
%Y Lüngen, Harald
%Y Pisetta, Ines
%S Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-61-0
%G English
%F popa-fabre-etal-2020-french
%X This paper investigates the impact of different types and size of training corpora on language models. By asking the fundamental question of quality versus quantity, we compare four French corpora by pre-training four different ELMos and evaluating them on dependency parsing, POS-tagging and Named Entities Recognition downstream tasks. We present and assess the relevance of a new balanced French corpus, CaBeRnet, that features a representative range of language usage, including a balanced variety of genres (oral transcriptions, newspapers, popular magazines, technical reports, fiction, academic texts), in oral and written styles. We hypothesize that a linguistically representative corpus will allow the language models to be more efficient, and therefore yield better evaluation scores on different evaluation sets and tasks. This paper offers three main contributions: (1) two newly built corpora: (a) CaBeRnet, a French Balanced Reference Corpus and (b) CBT-fr a domain-specific corpus having both oral and written style in youth literature, (2) five versions of ELMo pre-trained on differently built corpora, and (3) a whole array of computational results on downstream tasks that deepen our understanding of the effects of corpus balance and register in NLP evaluation.
%U https://aclanthology.org/2020.cmlc-1.3
%P 15-23
Markdown (Informal)
[French Contextualized Word-Embeddings with a sip of CaBeRnet: a New French Balanced Reference Corpus](https://aclanthology.org/2020.cmlc-1.3) (Popa-Fabre et al., CMLC 2020)
ACL