@inproceedings{sekine-grishman-1995-corpus,
title = "A Corpus-based Probabilistic Grammar with Only Two Non-terminals",
author = "Sekine, Satoshi and
Grishman, Ralph",
editor = "Hajicova, Eva and
Lang, Bernard and
Berwick, Robert and
Bunt, Harry and
Carpenter, Bob and
Church, Ken and
Joshi, Aravind and
Kaplan, Ronald and
Kay, Martin and
Nagao, Makoto and
Nijholt, Anton and
Steedman, Mark and
Thompson, Henry and
Tomita, Masaru and
Vijay-Shanker, K. and
Wilks, Yorick and
Wittenburg, Kent",
booktitle = "Proceedings of the Fourth International Workshop on Parsing Technologies",
month = sep # " 20-24",
year = "1995",
address = "Prague and Karlovy Vary, Czech Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/1995.iwpt-1.26",
pages = "216--223",
abstract = "The availability of large, syntactically-bracketed corpora such as the Penn Tree Bank affords us the opportunity to automatically build or train broad-coverage grammars, and in particular to train probabilistic grammars. A number of recent parsing experiments have also indicated that grammars whose production probabilities are dependent on the context can be more effective than context-free grammars in selecting a correct parse. To make maximal use of context, we have automatically constructed, from the Penn Tree Bank version 2, a grammar in which the symbols S and NP are the only real nonterminals, and the other non-terminals or grammatical nodes are in effect embedded into the right-hand-sides of the S and NP rules. For example, one of the rules extracted from the tree bank would be S -{\textgreater} NP VBX JJ CC VBX NP [1] ( where NP is a non-terminal and the other symbols are terminals {--} part-of-speech tags of the Tree Bank). The most common structure in the Tree Bank associated with this expansion is (S NP (VP (VP VBX (ADJ JJ) CC (VP VBX NP)))) [2]. So if our parser uses rule [1] in parsing a sentence, it will generate structure [2] for the corresponding part of the sentence. Using 94{\%} of the Penn Tree Bank for training, we extracted 32,296 distinct rules ( 23,386 for S, and 8,910 for NP). We also built a smaller version of the grammar based on higher frequency patterns for use as a back-up when the larger grammar is unable to produce a parse due to memory limitation. We applied this parser to 1,989 Wall Street Journal sentences (separate from the training set and with no limit on sentence length). Of the parsed sentences (1,899), the percentage of no-crossing sentences is 33.9{\%}, and Parseval recall and precision are 73.43{\%} and 72 .61{\%}.",
}
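The abstract above describes reading rules off Penn Tree Bank bracketings so that S and NP are the only nonterminals, with every other node expanded in place and only its part-of-speech leaves kept on the right-hand side, while the most frequent original structure is remembered for each flattened rule. The following Python sketch illustrates that flattening step on the example tree from the abstract; it is not the authors' implementation, and the tuple encoding of trees plus the names `flatten` and `structures` are assumptions made here for illustration only.

```python
# Illustrative sketch (not the authors' code): flatten a bracketed parse into a
# two-nonterminal rule and record which original structure produced it.
from collections import Counter, defaultdict

def flatten(tree, keep=("S", "NP")):
    """Return the right-hand side of the flattened rule for `tree`.

    `tree` is a nested tuple (LABEL, child, ...); string children are either
    POS-tag terminals or bare S/NP nodes. Subtrees labelled S or NP are kept
    as single nonterminal symbols; all other subtrees (VP, ADJ, ...) are
    expanded in place so only their POS-tag leaves remain on the RHS.
    """
    rhs = []
    for child in tree[1:]:
        if isinstance(child, str):
            rhs.append(child)            # POS tag, or a bare NP/S symbol
        elif child[0] in keep:
            rhs.append(child[0])         # keep S/NP subtree as a nonterminal
        else:
            rhs.extend(flatten(child, keep))  # expand e.g. VP, ADJ in place
    return tuple(rhs)

# rule -> counts of the original structures seen for it; at parse time the
# most frequent structure would be emitted for a matched rule.
structures = defaultdict(Counter)

# The example from the abstract: (S NP (VP (VP VBX (ADJ JJ) CC (VP VBX NP))))
example = ("S",
           "NP",
           ("VP",
            ("VP", "VBX", ("ADJ", "JJ"), "CC", ("VP", "VBX", "NP"))))

rule = ("S",) + flatten(example)
structures[rule][example] += 1

print(" ".join(rule))                          # S NP VBX JJ CC VBX NP, i.e. rule [1]
print(structures[rule].most_common(1)[0][0])   # structure [2] to emit for this rule
```

Run over the training portion of the tree bank, accumulating counts in this way would yield the kind of rule inventory the abstract reports (32,296 distinct S and NP rules), with relative frequencies usable as rule probabilities.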
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sekine-grishman-1995-corpus">
<titleInfo>
<title>A Corpus-based Probabilistic Grammar with Only Two Non-terminals</title>
</titleInfo>
<name type="personal">
<namePart type="given">Satoshi</namePart>
<namePart type="family">Sekine</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ralph</namePart>
<namePart type="family">Grishman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>September 20-24, 1995</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth International Workshop on Parsing Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eva</namePart>
<namePart type="family">Hajicova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bernard</namePart>
<namePart type="family">Lang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Berwick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harry</namePart>
<namePart type="family">Bunt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bob</namePart>
<namePart type="family">Carpenter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ken</namePart>
<namePart type="family">Church</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aravind</namePart>
<namePart type="family">Joshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronald</namePart>
<namePart type="family">Kaplan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="family">Kay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Makoto</namePart>
<namePart type="family">Nagao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anton</namePart>
<namePart type="family">Nijholt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Steedman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henry</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masaru</namePart>
<namePart type="family">Tomita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">K</namePart>
<namePart type="family">Vijay-Shanker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yorick</namePart>
<namePart type="family">Wilks</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kent</namePart>
<namePart type="family">Wittenburg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Prague and Karlovy Vary, Czech Republic</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The availability of large, syntactically-bracketed corpora such as the Penn Tree Bank affords us the opportunity to automatically build or train broad-coverage grammars, and in particular to train probabilistic grammars. A number of recent parsing experiments have also indicated that grammars whose production probabilities are dependent on the context can be more effective than context-free grammars in selecting a correct parse. To make maximal use of context, we have automatically constructed, from the Penn Tree Bank version 2, a grammar in which the symbols S and NP are the only real nonterminals, and the other non-terminals or grammatical nodes are in effect embedded into the right-hand-sides of the S and NP rules. For example, one of the rules extracted from the tree bank would be S -&gt; NP VBX JJ CC VBX NP [1] (where NP is a non-terminal and the other symbols are terminals – part-of-speech tags of the Tree Bank). The most common structure in the Tree Bank associated with this expansion is (S NP (VP (VP VBX (ADJ JJ) CC (VP VBX NP)))) [2]. So if our parser uses rule [1] in parsing a sentence, it will generate structure [2] for the corresponding part of the sentence. Using 94% of the Penn Tree Bank for training, we extracted 32,296 distinct rules (23,386 for S, and 8,910 for NP). We also built a smaller version of the grammar based on higher frequency patterns for use as a back-up when the larger grammar is unable to produce a parse due to memory limitation. We applied this parser to 1,989 Wall Street Journal sentences (separate from the training set and with no limit on sentence length). Of the parsed sentences (1,899), the percentage of no-crossing sentences is 33.9%, and Parseval recall and precision are 73.43% and 72.61%.</abstract>
<identifier type="citekey">sekine-grishman-1995-corpus</identifier>
<location>
<url>https://aclanthology.org/1995.iwpt-1.26</url>
</location>
<part>
<date>September 20-24, 1995</date>
<extent unit="page">
<start>216</start>
<end>223</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Corpus-based Probabilistic Grammar with Only Two Non-terminals
%A Sekine, Satoshi
%A Grishman, Ralph
%Y Hajicova, Eva
%Y Lang, Bernard
%Y Berwick, Robert
%Y Bunt, Harry
%Y Carpenter, Bob
%Y Church, Ken
%Y Joshi, Aravind
%Y Kaplan, Ronald
%Y Kay, Martin
%Y Nagao, Makoto
%Y Nijholt, Anton
%Y Steedman, Mark
%Y Thompson, Henry
%Y Tomita, Masaru
%Y Vijay-Shanker, K.
%Y Wilks, Yorick
%Y Wittenburg, Kent
%S Proceedings of the Fourth International Workshop on Parsing Technologies
%D 1995
%8 September 20-24
%I Association for Computational Linguistics
%C Prague and Karlovy Vary, Czech Republic
%F sekine-grishman-1995-corpus
%X The availability of large, syntactically-bracketed corpora such as the Penn Tree Bank affords us the opportunity to automatically build or train broad-coverage grammars, and in particular to train probabilistic grammars. A number of recent parsing experiments have also indicated that grammars whose production probabilities are dependent on the context can be more effective than context-free grammars in selecting a correct parse. To make maximal use of context, we have automatically constructed, from the Penn Tree Bank version 2, a grammar in which the symbols S and NP are the only real nonterminals, and the other non-terminals or grammatical nodes are in effect embedded into the right-hand-sides of the S and NP rules. For example, one of the rules extracted from the tree bank would be S -> NP VBX JJ CC VBX NP [1] (where NP is a non-terminal and the other symbols are terminals – part-of-speech tags of the Tree Bank). The most common structure in the Tree Bank associated with this expansion is (S NP (VP (VP VBX (ADJ JJ) CC (VP VBX NP)))) [2]. So if our parser uses rule [1] in parsing a sentence, it will generate structure [2] for the corresponding part of the sentence. Using 94% of the Penn Tree Bank for training, we extracted 32,296 distinct rules (23,386 for S, and 8,910 for NP). We also built a smaller version of the grammar based on higher frequency patterns for use as a back-up when the larger grammar is unable to produce a parse due to memory limitation. We applied this parser to 1,989 Wall Street Journal sentences (separate from the training set and with no limit on sentence length). Of the parsed sentences (1,899), the percentage of no-crossing sentences is 33.9%, and Parseval recall and precision are 73.43% and 72.61%.
%U https://aclanthology.org/1995.iwpt-1.26
%P 216-223
Markdown (Informal)
[A Corpus-based Probabilistic Grammar with Only Two Non-terminals](https://aclanthology.org/1995.iwpt-1.26) (Sekine & Grishman, IWPT-WS 1995)