@inproceedings{agnoloni-etal-2022-making,
title = "Making {I}talian Parliamentary Records Machine-Actionable: the Construction of the {P}arla{M}int-{IT} corpus",
author = "Agnoloni, Tommaso and
Bartolini, Roberto and
Frontini, Francesca and
Montemagni, Simonetta and
Marchetti, Carlo and
Quochi, Valeria and
Ruisi, Manuela and
Venturi, Giulia",
editor = "Fi{\v{s}}er, Darja and
Eskevich, Maria and
Lenardi{\v{c}}, Jakob and
de Jong, Franciska",
booktitle = "Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.parlaclarin-1.17",
pages = "117--124",
abstract = "This paper describes the process of acquisition, cleaning, interpretation, coding and linguistic annotation of a collection of parliamentary debates from the Senate of the Italian Republic covering the COVID-19 period and a former period for reference and comparison according to the CLARIN ParlaMint guidelines and prescriptions. The corpus contains 1199 sessions and 79,373 speeches, for a total of about 31 million words and was encoded according to the ParlaCLARIN TEI XML format, as well as in CoNLL-UD format. It includes extensive metadata about the speakers, the sessions, the political parties and Parliamentary groups. As required by the ParlaMint initiative, the corpus was also linguistically annotated for sentences, tokens, POS tags, lemmas and dependency syntax according to the universal dependencies guidelines. Named entity classification was also included. All linguistic annotation was performed automatically using state-of-the-art NLP technology with no manual revision. The Italian dataset is freely available as part of the larger ParlaMint 2.1 corpus deposited and archived in CLARIN repository together with all other national corpora. It is also available for direct analysis and inspection via various CLARIN services and has already been used both for research and educational purposes.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="agnoloni-etal-2022-making">
<titleInfo>
<title>Making Italian Parliamentary Records Machine-Actionable: the Construction of the ParlaMint-IT corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tommaso</namePart>
<namePart type="family">Agnoloni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Bartolini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesca</namePart>
<namePart type="family">Frontini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlo</namePart>
<namePart type="family">Marchetti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valeria</namePart>
<namePart type="family">Quochi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manuela</namePart>
<namePart type="family">Ruisi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giulia</namePart>
<namePart type="family">Venturi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Darja</namePart>
<namePart type="family">Fišer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Eskevich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jakob</namePart>
<namePart type="family">Lenardič</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Franciska</namePart>
<namePart type="family">de Jong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes the process of acquisition, cleaning, interpretation, coding and linguistic annotation of a collection of parliamentary debates from the Senate of the Italian Republic covering the COVID-19 period and a former period for reference and comparison according to the CLARIN ParlaMint guidelines and prescriptions. The corpus contains 1199 sessions and 79,373 speeches, for a total of about 31 million words and was encoded according to the ParlaCLARIN TEI XML format, as well as in CoNLL-UD format. It includes extensive metadata about the speakers, the sessions, the political parties and Parliamentary groups. As required by the ParlaMint initiative, the corpus was also linguistically annotated for sentences, tokens, POS tags, lemmas and dependency syntax according to the universal dependencies guidelines. Named entity classification was also included. All linguistic annotation was performed automatically using state-of-the-art NLP technology with no manual revision. The Italian dataset is freely available as part of the larger ParlaMint 2.1 corpus deposited and archived in CLARIN repository together with all other national corpora. It is also available for direct analysis and inspection via various CLARIN services and has already been used both for research and educational purposes.</abstract>
<identifier type="citekey">agnoloni-etal-2022-making</identifier>
<location>
<url>https://aclanthology.org/2022.parlaclarin-1.17</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>117</start>
<end>124</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Making Italian Parliamentary Records Machine-Actionable: the Construction of the ParlaMint-IT corpus
%A Agnoloni, Tommaso
%A Bartolini, Roberto
%A Frontini, Francesca
%A Montemagni, Simonetta
%A Marchetti, Carlo
%A Quochi, Valeria
%A Ruisi, Manuela
%A Venturi, Giulia
%Y Fišer, Darja
%Y Eskevich, Maria
%Y Lenardič, Jakob
%Y de Jong, Franciska
%S Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F agnoloni-etal-2022-making
%X This paper describes the process of acquisition, cleaning, interpretation, coding and linguistic annotation of a collection of parliamentary debates from the Senate of the Italian Republic covering the COVID-19 period and a former period for reference and comparison according to the CLARIN ParlaMint guidelines and prescriptions. The corpus contains 1199 sessions and 79,373 speeches, for a total of about 31 million words and was encoded according to the ParlaCLARIN TEI XML format, as well as in CoNLL-UD format. It includes extensive metadata about the speakers, the sessions, the political parties and Parliamentary groups. As required by the ParlaMint initiative, the corpus was also linguistically annotated for sentences, tokens, POS tags, lemmas and dependency syntax according to the universal dependencies guidelines. Named entity classification was also included. All linguistic annotation was performed automatically using state-of-the-art NLP technology with no manual revision. The Italian dataset is freely available as part of the larger ParlaMint 2.1 corpus deposited and archived in CLARIN repository together with all other national corpora. It is also available for direct analysis and inspection via various CLARIN services and has already been used both for research and educational purposes.
%U https://aclanthology.org/2022.parlaclarin-1.17
%P 117-124
Markdown (Informal)
[Making Italian Parliamentary Records Machine-Actionable: the Construction of the ParlaMint-IT corpus](https://aclanthology.org/2022.parlaclarin-1.17) (Agnoloni et al., ParlaCLARIN 2022)
ACL
- Tommaso Agnoloni, Roberto Bartolini, Francesca Frontini, Simonetta Montemagni, Carlo Marchetti, Valeria Quochi, Manuela Ruisi, and Giulia Venturi. 2022. Making Italian Parliamentary Records Machine-Actionable: the Construction of the ParlaMint-IT corpus. In Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference, pages 117–124, Marseille, France. European Language Resources Association.