@inproceedings{scholivet-etal-2025-selexini,
title = "{SELEXINI} {--} a large and diverse automatically parsed corpus of {F}rench",
author = "Scholivet, Manon and
Savary, Agata and
Est{\`e}ve, Louis and
Candito, Marie and
Ramisch, Carlos",
editor = "Sharoff, Serge and
Terryn, Ayla Rigouts and
Zweigenbaum, Pierre and
Rapp, Reinhard",
booktitle = "Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bucc-1.10/",
pages = "83--98",
abstract = "The annotation of large text corpora is essential for many tasks. We present here a large automatically annotated corpus for French. This corpus is separated into two parts: the first from BigScience, and the second from HPLT. The annotated documents from HPLT were selected in order to optimise the lexical diversity of the final corpus SELEXINI. An analysis of the impact of this selection was carried out on syntactic diversity, as well as on the quality of the new words resulting from the HPLT part of SELEXINI. We have shown that despite the introduction of interesting new words, the texts extracted from HPLT are very noisy. Furthermore, increasing lexical diversity did not increase syntactic diversity."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="scholivet-etal-2025-selexini">
<titleInfo>
<title>SELEXINI – a large and diverse automatically parsed corpus of French</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manon</namePart>
<namePart type="family">Scholivet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Agata</namePart>
<namePart type="family">Savary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Louis</namePart>
<namePart type="family">Estève</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Candito</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="family">Ramisch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Serge</namePart>
<namePart type="family">Sharoff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayla</namePart>
<namePart type="given">Rigouts</namePart>
<namePart type="family">Terryn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Zweigenbaum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reinhard</namePart>
<namePart type="family">Rapp</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The annotation of large text corpora is essential for many tasks. We present here a large automatically annotated corpus for French. This corpus is separated into two parts: the first from BigScience, and the second from HPLT. The annotated documents from HPLT were selected in order to optimise the lexical diversity of the final corpus SELEXINI. An analysis of the impact of this selection was carried out on syntactic diversity, as well as on the quality of the new words resulting from the HPLT part of SELEXINI. We have shown that despite the introduction of interesting new words, the texts extracted from HPLT are very noisy. Furthermore, increasing lexical diversity did not increase syntactic diversity.</abstract>
<identifier type="citekey">scholivet-etal-2025-selexini</identifier>
<location>
<url>https://aclanthology.org/2025.bucc-1.10/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>83</start>
<end>98</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SELEXINI – a large and diverse automatically parsed corpus of French
%A Scholivet, Manon
%A Savary, Agata
%A Estève, Louis
%A Candito, Marie
%A Ramisch, Carlos
%Y Sharoff, Serge
%Y Terryn, Ayla Rigouts
%Y Zweigenbaum, Pierre
%Y Rapp, Reinhard
%S Proceedings of the 18th Workshop on Building and Using Comparable Corpora (BUCC)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F scholivet-etal-2025-selexini
%X The annotation of large text corpora is essential for many tasks. We present here a large automatically annotated corpus for French. This corpus is separated into two parts: the first from BigScience, and the second from HPLT. The annotated documents from HPLT were selected in order to optimise the lexical diversity of the final corpus SELEXINI. An analysis of the impact of this selection was carried out on syntactic diversity, as well as on the quality of the new words resulting from the HPLT part of SELEXINI. We have shown that despite the introduction of interesting new words, the texts extracted from HPLT are very noisy. Furthermore, increasing lexical diversity did not increase syntactic diversity.
%U https://aclanthology.org/2025.bucc-1.10/
%P 83-98
Markdown (Informal)
[SELEXINI – a large and diverse automatically parsed corpus of French](https://aclanthology.org/2025.bucc-1.10/) (Scholivet et al., BUCC 2025)
ACL