@inproceedings{ogrodniczuk-2012-polish,
title = "The {P}olish Sejm Corpus",
author = "Ogrodniczuk, Maciej",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/653_Paper.pdf",
pages = "2219--2223",
abstract = "This document presents the first edition of the Polish Sejm Corpus -- a new specialized resource containing transcribed, automatically annotated utterances of the Members of Polish Sejm (lower chamber of the Polish Parliament). The corpus data encoding is inherited from the National Corpus of Polish and enhanced with session metadata and structure. The multi-layered stand-off annotation contains sentence- and token-level segmentation, disambiguated morphosyntactic information, syntactic words and groups resulting from shallow parsing and named entities. The paper also outlines several novel ideas for corpus preparation, e.g. the notion of a live corpus, constantly populated with new data or the concept of linking corpus data with external databases to enrich content. Although initial statistical comparison of the resource with the balanced corpus of general Polish reveals substantial differences in language richness, the resource makes a valuable source of linguistic information as a large (300 M segments) collection of quasi-spoken data ready to be aligned with the audio/video recording of sessions, currently being made publicly available by Sejm.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ogrodniczuk-2012-polish">
<titleInfo>
<title>The Polish Sejm Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maciej</namePart>
<namePart type="family">Ogrodniczuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Uğur</namePart>
<namePart type="family">Doğan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This document presents the first edition of the Polish Sejm Corpus – a new specialized resource containing transcribed, automatically annotated utterances of the Members of Polish Sejm (lower chamber of the Polish Parliament). The corpus data encoding is inherited from the National Corpus of Polish and enhanced with session metadata and structure. The multi-layered stand-off annotation contains sentence- and token-level segmentation, disambiguated morphosyntactic information, syntactic words and groups resulting from shallow parsing and named entities. The paper also outlines several novel ideas for corpus preparation, e.g. the notion of a live corpus, constantly populated with new data or the concept of linking corpus data with external databases to enrich content. Although initial statistical comparison of the resource with the balanced corpus of general Polish reveals substantial differences in language richness, the resource makes a valuable source of linguistic information as a large (300 M segments) collection of quasi-spoken data ready to be aligned with the audio/video recording of sessions, currently being made publicly available by Sejm.</abstract>
<identifier type="citekey">ogrodniczuk-2012-polish</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/653_Paper.pdf</url>
</location>
<part>
<date>2012-05</date>
<extent unit="page">
<start>2219</start>
<end>2223</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Polish Sejm Corpus
%A Ogrodniczuk, Maciej
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Doğan, Mehmet Uğur
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 May
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F ogrodniczuk-2012-polish
%X This document presents the first edition of the Polish Sejm Corpus – a new specialized resource containing transcribed, automatically annotated utterances of the Members of Polish Sejm (lower chamber of the Polish Parliament). The corpus data encoding is inherited from the National Corpus of Polish and enhanced with session metadata and structure. The multi-layered stand-off annotation contains sentence- and token-level segmentation, disambiguated morphosyntactic information, syntactic words and groups resulting from shallow parsing and named entities. The paper also outlines several novel ideas for corpus preparation, e.g. the notion of a live corpus, constantly populated with new data or the concept of linking corpus data with external databases to enrich content. Although initial statistical comparison of the resource with the balanced corpus of general Polish reveals substantial differences in language richness, the resource makes a valuable source of linguistic information as a large (300 M segments) collection of quasi-spoken data ready to be aligned with the audio/video recording of sessions, currently being made publicly available by Sejm.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/653_Paper.pdf
%P 2219-2223
Markdown (Informal)
[The Polish Sejm Corpus](http://www.lrec-conf.org/proceedings/lrec2012/pdf/653_Paper.pdf) (Ogrodniczuk, LREC 2012)
ACL
- Maciej Ogrodniczuk. 2012. The Polish Sejm Corpus. In Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12), pages 2219–2223, Istanbul, Turkey. European Language Resources Association (ELRA).