@inproceedings{voutilainen-etal-2012-specifying,
title = "Specifying Treebanks, Outsourcing Parsebanks: {F}inn{T}ree{B}ank 3",
author = "Voutilainen, Atro and
Muhonen, Kristiina and
Purtonen, Tanja and
Lind{\'e}n, Krister",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/766_Paper.pdf",
pages = "1927--1931",
abstract = "Corpus-based treebank annotation is known to result in incomplete coverage of mid- and low-frequency linguistic constructions: the linguistic representation and corpus annotation quality are sometimes suboptimal. Large descriptive grammars cover also many mid- and low-frequency constructions. We argue for use of large descriptive grammars and their sample sentences as a basis for specifying higher-coverage grammatical representations. We present an sample case from an ongoing project (FIN-CLARIN FinnTreeBank) where an grammatical representation is documented as an annotator's manual alongside manual annotation of sample sentences extracted from a large descriptive grammar of Finnish. We outline the linguistic representation (morphology and dependency syntax) for Finnish, and show how the resulting `Grammar Definition Corpus' and the documentation is used as a task specification for an external subcontractor for building a parser engine for use in morphological and dependency syntactic analysis of large volumes of Finnish for parsebanking purposes. The resulting corpus, FinnTreeBank 3, is due for release in June 2012, and will contain tens of millions of words from publicly available corpora of Finnish with automatic morphological and dependency syntactic analysis, for use in research on the corpus linguistics and language engineering.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="voutilainen-etal-2012-specifying">
<titleInfo>
<title>Specifying Treebanks, Outsourcing Parsebanks: FinnTreeBank 3</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atro</namePart>
<namePart type="family">Voutilainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristiina</namePart>
<namePart type="family">Muhonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanja</namePart>
<namePart type="family">Purtonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krister</namePart>
<namePart type="family">Lindén</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Uğur</namePart>
<namePart type="family">Doğan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Corpus-based treebank annotation is known to result in incomplete coverage of mid- and low-frequency linguistic constructions: the linguistic representation and corpus annotation quality are sometimes suboptimal. Large descriptive grammars cover also many mid- and low-frequency constructions. We argue for use of large descriptive grammars and their sample sentences as a basis for specifying higher-coverage grammatical representations. We present an sample case from an ongoing project (FIN-CLARIN FinnTreeBank) where an grammatical representation is documented as an annotator’s manual alongside manual annotation of sample sentences extracted from a large descriptive grammar of Finnish. We outline the linguistic representation (morphology and dependency syntax) for Finnish, and show how the resulting ‘Grammar Definition Corpus’ and the documentation is used as a task specification for an external subcontractor for building a parser engine for use in morphological and dependency syntactic analysis of large volumes of Finnish for parsebanking purposes. The resulting corpus, FinnTreeBank 3, is due for release in June 2012, and will contain tens of millions of words from publicly available corpora of Finnish with automatic morphological and dependency syntactic analysis, for use in research on the corpus linguistics and language engineering.</abstract>
<identifier type="citekey">voutilainen-etal-2012-specifying</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/766_Paper.pdf</url>
</location>
<part>
<date>2012-05</date>
<extent unit="page">
<start>1927</start>
<end>1931</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Specifying Treebanks, Outsourcing Parsebanks: FinnTreeBank 3
%A Voutilainen, Atro
%A Muhonen, Kristiina
%A Purtonen, Tanja
%A Lindén, Krister
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Doğan, Mehmet Uğur
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 May
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F voutilainen-etal-2012-specifying
%X Corpus-based treebank annotation is known to result in incomplete coverage of mid- and low-frequency linguistic constructions: the linguistic representation and corpus annotation quality are sometimes suboptimal. Large descriptive grammars cover also many mid- and low-frequency constructions. We argue for use of large descriptive grammars and their sample sentences as a basis for specifying higher-coverage grammatical representations. We present an sample case from an ongoing project (FIN-CLARIN FinnTreeBank) where an grammatical representation is documented as an annotator’s manual alongside manual annotation of sample sentences extracted from a large descriptive grammar of Finnish. We outline the linguistic representation (morphology and dependency syntax) for Finnish, and show how the resulting ‘Grammar Definition Corpus’ and the documentation is used as a task specification for an external subcontractor for building a parser engine for use in morphological and dependency syntactic analysis of large volumes of Finnish for parsebanking purposes. The resulting corpus, FinnTreeBank 3, is due for release in June 2012, and will contain tens of millions of words from publicly available corpora of Finnish with automatic morphological and dependency syntactic analysis, for use in research on the corpus linguistics and language engineering.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/766_Paper.pdf
%P 1927-1931
Markdown (Informal)
[Specifying Treebanks, Outsourcing Parsebanks: FinnTreeBank 3](http://www.lrec-conf.org/proceedings/lrec2012/pdf/766_Paper.pdf) (Voutilainen et al., LREC 2012)
ACL
- Atro Voutilainen, Kristiina Muhonen, Tanja Purtonen, and Krister Lindén. 2012. Specifying Treebanks, Outsourcing Parsebanks: FinnTreeBank 3. In Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12), pages 1927–1931, Istanbul, Turkey. European Language Resources Association (ELRA).