@inproceedings{futrelle-etal-1991-preprocessing,
title = "Preprocessing and lexicon design for parsing technical text",
author = "Futrelle, Robert P. and
Dunn, Christopher E. and
Ellis, Debra S. and
Pescitelli, Jr., Maurice J.",
editor = "Tomita, Masaru and
Kay, Martin and
Berwick, Robert and
Hajicova, Eva and
Joshi, Aravind and
Kaplan, Ronald and
Nagao, Makoto and
Wilks, Yorick",
booktitle = "Proceedings of the Second International Workshop on Parsing Technologies",
month = feb # " 13-25",
year = "1991",
address = "Cancun, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/1991.iwpt-1.5",
pages = "31--40",
abstract = "Technical documents with complex structures and orthography present special difficulties for current parsing technology. These include technical notation such as subscripts, superscripts and numeric and algebraic expressions as well as Greek letters, italics, small capitals, brackets and punctuation marks. Structural elements such as references to figures, tables and bibliographic items also cause problems. We first hand-code documents in Standard Generalized Markup Language (SGML) to specify the document{'}s logical structure (paragraphs, sentences, etc.) and capture significant orthography. Next, a regular expression analyzer produced by LEX is used to tokenize the SGML text. Then a token-based phrasal lexicon is used to identify the longest token sequences in the input that represent single lexical items. This lookup is efficient because limits on lookahead are precomputed for every item. After this, the Alvey Tools parser with specialized subgrammars is used to discover items such as floating-point numbers. The product of these preprocessing stages is a text that is acceptable to a full natural language parser. This work is directed towards automating the building of knowledge bases from research articles in the field of bacterial chemotaxis, but the techniques should be of wide applicability.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="futrelle-etal-1991-preprocessing">
<titleInfo>
<title>Preprocessing and lexicon design for parsing technical text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Futrelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="given">E</namePart>
<namePart type="family">Dunn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debra</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Ellis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maurice</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Pescitelli</namePart>
<namePart type="suffix">Jr.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>1991-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second International Workshop on Parsing Technologies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Masaru</namePart>
<namePart type="family">Tomita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="family">Kay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Berwick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eva</namePart>
<namePart type="family">Hajicova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aravind</namePart>
<namePart type="family">Joshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronald</namePart>
<namePart type="family">Kaplan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Makoto</namePart>
<namePart type="family">Nagao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yorick</namePart>
<namePart type="family">Wilks</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Cancun, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Technical documents with complex structures and orthography present special difficulties for current parsing technology. These include technical notation such as subscripts, superscripts and numeric and algebraic expressions as well as Greek letters, italics, small capitals, brackets and punctuation marks. Structural elements such as references to figures, tables and bibliographic items also cause problems. We first hand-code documents in Standard Generalized Markup Language (SGML) to specify the document’s logical structure (paragraphs, sentences, etc.) and capture significant orthography. Next, a regular expression analyzer produced by LEX is used to tokenize the SGML text. Then a token-based phrasal lexicon is used to identify the longest token sequences in the input that represent single lexical items. This lookup is efficient because limits on lookahead are precomputed for every item. After this, the Alvey Tools parser with specialized subgrammars is used to discover items such as floating-point numbers. The product of these preprocessing stages is a text that is acceptable to a full natural language parser. This work is directed towards automating the building of knowledge bases from research articles in the field of bacterial chemotaxis, but the techniques should be of wide applicability.</abstract>
<identifier type="citekey">futrelle-etal-1991-preprocessing</identifier>
<location>
<url>https://aclanthology.org/1991.iwpt-1.5</url>
</location>
<part>
<date>1991-02</date>
<extent unit="page">
<start>31</start>
<end>40</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Preprocessing and lexicon design for parsing technical text
%A Futrelle, Robert P.
%A Dunn, Christopher E.
%A Ellis, Debra S.
%A Pescitelli Jr., Maurice J.
%Y Tomita, Masaru
%Y Kay, Martin
%Y Berwick, Robert
%Y Hajicova, Eva
%Y Joshi, Aravind
%Y Kaplan, Ronald
%Y Nagao, Makoto
%Y Wilks, Yorick
%S Proceedings of the Second International Workshop on Parsing Technologies
%D 1991
%8 feb 13 25
%I Association for Computational Linguistics
%C Cancun, Mexico
%F futrelle-etal-1991-preprocessing
%X Technical documents with complex structures and orthography present special difficulties for current parsing technology. These include technical notation such as subscripts, superscripts and numeric and algebraic expressions as well as Greek letters, italics, small capitals, brackets and punctuation marks. Structural elements such as references to figures, tables and bibliographic items also cause problems. We first hand-code documents in Standard Generalized Markup Language (SGML) to specify the document’s logical structure (paragraphs, sentences, etc.) and capture significant orthography. Next, a regular expression analyzer produced by LEX is used to tokenize the SGML text. Then a token-based phrasal lexicon is used to identify the longest token sequences in the input that represent single lexical items. This lookup is efficient because limits on lookahead are precomputed for every item. After this, the Alvey Tools parser with specialized subgrammars is used to discover items such as floating-point numbers. The product of these preprocessing stages is a text that is acceptable to a full natural language parser. This work is directed towards automating the building of knowledge bases from research articles in the field of bacterial chemotaxis, but the techniques should be of wide applicability.
%U https://aclanthology.org/1991.iwpt-1.5
%P 31-40
Markdown (Informal)
[Preprocessing and lexicon design for parsing technical text](https://aclanthology.org/1991.iwpt-1.5) (Futrelle et al., IWPT 1991)
ACL
- Robert P. Futrelle, Christopher E. Dunn, Debra S. Ellis, and Maurice J. Pescitelli, Jr. 1991. Preprocessing and lexicon design for parsing technical text. In Proceedings of the Second International Workshop on Parsing Technologies, pages 31–40, Cancun, Mexico. Association for Computational Linguistics.