@inproceedings{yeh-etal-2016-annotated,
title = "An Annotated Corpus and Method for Analysis of Ad-Hoc Structures Embedded in Text",
author = "Yeh, Eric and
Niekrasz, John and
Freitag, Dayne and
Rohwer, Richard",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Goggi, Sara and
Grobelnik, Marko and
Maegaard, Bente and
Mariani, Joseph and
Mazo, Helene and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)",
month = may,
year = "2016",
address = "Portoro{\v{z}}, Slovenia",
publisher = "European Language Resources Association (ELRA)",
url = "https://aclanthology.org/L16-1327",
pages = "2063--2070",
abstract = "We describe a method for identifying and performing functional analysis of structured regions that are embedded in natural language documents, such as tables or key-value lists. Such regions often encode information according to ad hoc schemas and avail themselves of visual cues in place of natural language grammar, presenting problems for standard information extraction algorithms. Unlike previous work in table extraction, which assumes a relatively noiseless two-dimensional layout, our aim is to accommodate a wide variety of naturally occurring structure types. Our approach has three main parts. First, we collect and annotate a a diverse sample of {``}naturally{''} occurring structures from several sources. Second, we use probabilistic text segmentation techniques, featurized by skip bigrams over spatial and token category cues, to automatically identify contiguous regions of structured text that share a common schema. Finally, we identify the records and fields within each structured region using a combination of distributional similarity and sequence alignment methods, guided by minimal supervision in the form of a single annotated record. We evaluate the last two components individually, and conclude with a discussion of further work.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yeh-etal-2016-annotated">
<titleInfo>
<title>An Annotated Corpus and Method for Analysis of Ad-Hoc Structures Embedded in Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Yeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Niekrasz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayne</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Rohwer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Grobelnik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helene</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Portorož, Slovenia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We describe a method for identifying and performing functional analysis of structured regions that are embedded in natural language documents, such as tables or key-value lists. Such regions often encode information according to ad hoc schemas and avail themselves of visual cues in place of natural language grammar, presenting problems for standard information extraction algorithms. Unlike previous work in table extraction, which assumes a relatively noiseless two-dimensional layout, our aim is to accommodate a wide variety of naturally occurring structure types. Our approach has three main parts. First, we collect and annotate a a diverse sample of “naturally” occurring structures from several sources. Second, we use probabilistic text segmentation techniques, featurized by skip bigrams over spatial and token category cues, to automatically identify contiguous regions of structured text that share a common schema. Finally, we identify the records and fields within each structured region using a combination of distributional similarity and sequence alignment methods, guided by minimal supervision in the form of a single annotated record. We evaluate the last two components individually, and conclude with a discussion of further work.</abstract>
<identifier type="citekey">yeh-etal-2016-annotated</identifier>
<location>
<url>https://aclanthology.org/L16-1327</url>
</location>
<part>
<date>2016-05</date>
<extent unit="page">
<start>2063</start>
<end>2070</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Annotated Corpus and Method for Analysis of Ad-Hoc Structures Embedded in Text
%A Yeh, Eric
%A Niekrasz, John
%A Freitag, Dayne
%A Rohwer, Richard
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Grobelnik, Marko
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Helene
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC’16)
%D 2016
%8 May
%I European Language Resources Association (ELRA)
%C Portorož, Slovenia
%F yeh-etal-2016-annotated
%X We describe a method for identifying and performing functional analysis of structured regions that are embedded in natural language documents, such as tables or key-value lists. Such regions often encode information according to ad hoc schemas and avail themselves of visual cues in place of natural language grammar, presenting problems for standard information extraction algorithms. Unlike previous work in table extraction, which assumes a relatively noiseless two-dimensional layout, our aim is to accommodate a wide variety of naturally occurring structure types. Our approach has three main parts. First, we collect and annotate a a diverse sample of “naturally” occurring structures from several sources. Second, we use probabilistic text segmentation techniques, featurized by skip bigrams over spatial and token category cues, to automatically identify contiguous regions of structured text that share a common schema. Finally, we identify the records and fields within each structured region using a combination of distributional similarity and sequence alignment methods, guided by minimal supervision in the form of a single annotated record. We evaluate the last two components individually, and conclude with a discussion of further work.
%U https://aclanthology.org/L16-1327
%P 2063-2070
Markdown (Informal)
[An Annotated Corpus and Method for Analysis of Ad-Hoc Structures Embedded in Text](https://aclanthology.org/L16-1327) (Yeh et al., LREC 2016)
ACL