% ParlaCLARIN III (LREC 2022) workshop paper; record as listed in the ACL Anthology.
@inproceedings{puren-etal-2022-history,
    title     = {Between History and Natural Language Processing: Study, Enrichment and Online Publication of {French} Parliamentary Debates of the Early Third Republic (1881-1899)},
    author    = {Puren, Marie and
                 Pellet, Aur{\'e}lien and
                 Bourgeois, Nicolas and
                 Vernus, Pierre and
                 Lebreton, Fanny},
    editor    = {Fi{\v{s}}er, Darja and
                 Eskevich, Maria and
                 Lenardi{\v{c}}, Jakob and
                 de Jong, Franciska},
    booktitle = {Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference},
    month     = jun,
    year      = {2022},
    address   = {Marseille, France},
    publisher = {European Language Resources Association},
    url       = {https://aclanthology.org/2022.parlaclarin-1.3/},
    pages     = {16--24},
    abstract  = {We present the AGODA (Analyse s{\'e}mantique et Graphes relationnels pour l'Ouverture des D{\'e}bats {\`a} l'Assembl{\'e}e nationale) project, which aims to create a platform for consulting and exploring digitised French parliamentary debates (1881-1940) available in the digital library of the National Library of France. This project brings together historians and NLP specialists: parliamentary debates are indeed an essential source for French history of the contemporary period, but also for linguistics. This project therefore aims to produce a corpus of texts that can be easily exploited with computational methods, and that respect the TEI standard. Ancient parliamentary debates are also an excellent case study for the development and application of tools for publishing and exploring large historical corpora. In this paper, we present the steps necessary to produce such a corpus. We detail the processing and publication chain of these documents, in particular by mentioning the problems linked to the extraction of texts from digitised images. We also introduce the first analyses that we have carried out on this corpus with {\textquotedblleft}bag-of-words{\textquotedblright} techniques not too sensitive to OCR quality (namely topic modelling and word embedding).},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="puren-etal-2022-history">
<titleInfo>
<title>Between History and Natural Language Processing: Study, Enrichment and Online Publication of French Parliamentary Debates of the Early Third Republic (1881-1899)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Puren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aurélien</namePart>
<namePart type="family">Pellet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolas</namePart>
<namePart type="family">Bourgeois</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Vernus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fanny</namePart>
<namePart type="family">Lebreton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Darja</namePart>
<namePart type="family">Fišer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Eskevich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jakob</namePart>
<namePart type="family">Lenardič</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Franciska</namePart>
<namePart type="family">de Jong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present the AGODA (Analyse sémantique et Graphes relationnels pour l’Ouverture des Débats à l’Assemblée nationale) project, which aims to create a platform for consulting and exploring digitised French parliamentary debates (1881-1940) available in the digital library of the National Library of France. This project brings together historians and NLP specialists: parliamentary debates are indeed an essential source for French history of the contemporary period, but also for linguistics. This project therefore aims to produce a corpus of texts that can be easily exploited with computational methods, and that respect the TEI standard. Ancient parliamentary debates are also an excellent case study for the development and application of tools for publishing and exploring large historical corpora. In this paper, we present the steps necessary to produce such a corpus. We detail the processing and publication chain of these documents, in particular by mentioning the problems linked to the extraction of texts from digitised images. We also introduce the first analyses that we have carried out on this corpus with “bag-of-words” techniques not too sensitive to OCR quality (namely topic modelling and word embedding).</abstract>
<identifier type="citekey">puren-etal-2022-history</identifier>
<location>
<url>https://aclanthology.org/2022.parlaclarin-1.3/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>16</start>
<end>24</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Between History and Natural Language Processing: Study, Enrichment and Online Publication of French Parliamentary Debates of the Early Third Republic (1881-1899)
%A Puren, Marie
%A Pellet, Aurélien
%A Bourgeois, Nicolas
%A Vernus, Pierre
%A Lebreton, Fanny
%Y Fišer, Darja
%Y Eskevich, Maria
%Y Lenardič, Jakob
%Y de Jong, Franciska
%S Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F puren-etal-2022-history
%X We present the AGODA (Analyse sémantique et Graphes relationnels pour l’Ouverture des Débats à l’Assemblée nationale) project, which aims to create a platform for consulting and exploring digitised French parliamentary debates (1881-1940) available in the digital library of the National Library of France. This project brings together historians and NLP specialists: parliamentary debates are indeed an essential source for French history of the contemporary period, but also for linguistics. This project therefore aims to produce a corpus of texts that can be easily exploited with computational methods, and that respect the TEI standard. Ancient parliamentary debates are also an excellent case study for the development and application of tools for publishing and exploring large historical corpora. In this paper, we present the steps necessary to produce such a corpus. We detail the processing and publication chain of these documents, in particular by mentioning the problems linked to the extraction of texts from digitised images. We also introduce the first analyses that we have carried out on this corpus with “bag-of-words” techniques not too sensitive to OCR quality (namely topic modelling and word embedding).
%U https://aclanthology.org/2022.parlaclarin-1.3/
%P 16-24
Markdown (Informal)
[Between History and Natural Language Processing: Study, Enrichment and Online Publication of French Parliamentary Debates of the Early Third Republic (1881-1899)](https://aclanthology.org/2022.parlaclarin-1.3/) (Puren et al., ParlaCLARIN 2022)
ACL