@article{ritter-etal-2013-modeling,
title = "Modeling Missing Data in Distant Supervision for Information Extraction",
author = "Ritter, Alan and
Zettlemoyer, Luke and
{Mausam} and
Etzioni, Oren",
editor = "Lin, Dekang and
Collins, Michael",
journal = "Transactions of the Association for Computational Linguistics",
volume = "1",
year = "2013",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q13-1030",
doi = "10.1162/tacl_a_00234",
pages = "367--378",
abstract = "Distant supervision algorithms learn information extraction models given only large readily available databases and text collections. Most previous work has used heuristics for generating labeled data, for example assuming that facts not contained in the database are not mentioned in the text, and facts in the database must be mentioned at least once. In this paper, we propose a new latent-variable approach that models missing data. This provides a natural way to incorporate side information, for instance modeling the intuition that text will often mention rare entities which are likely to be missing in the database. Despite the added complexity introduced by reasoning about missing data, we demonstrate that a carefully designed local search approach to inference is very accurate and scales to large datasets. Experiments demonstrate improved performance for binary and unary relation extraction when compared to learning with heuristic labels, including on average a 27{\%} increase in area under the precision recall curve in the binary case.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ritter-etal-2013-modeling">
<titleInfo>
<title>Modeling Missing Data in Distant Supervision for Information Extraction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Zettlemoyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name>
<namePart>Mausam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oren</namePart>
<namePart type="family">Etzioni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2013</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Distant supervision algorithms learn information extraction models given only large readily available databases and text collections. Most previous work has used heuristics for generating labeled data, for example assuming that facts not contained in the database are not mentioned in the text, and facts in the database must be mentioned at least once. In this paper, we propose a new latent-variable approach that models missing data. This provides a natural way to incorporate side information, for instance modeling the intuition that text will often mention rare entities which are likely to be missing in the database. Despite the added complexity introduced by reasoning about missing data, we demonstrate that a carefully designed local search approach to inference is very accurate and scales to large datasets. Experiments demonstrate improved performance for binary and unary relation extraction when compared to learning with heuristic labels, including on average a 27% increase in area under the precision recall curve in the binary case.</abstract>
<identifier type="citekey">ritter-etal-2013-modeling</identifier>
<identifier type="doi">10.1162/tacl_a_00234</identifier>
<location>
<url>https://aclanthology.org/Q13-1030</url>
</location>
<part>
<date>2013</date>
<detail type="volume"><number>1</number></detail>
<extent unit="page">
<start>367</start>
<end>378</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Modeling Missing Data in Distant Supervision for Information Extraction
%A Ritter, Alan
%A Zettlemoyer, Luke
%A Mausam
%A Etzioni, Oren
%J Transactions of the Association for Computational Linguistics
%D 2013
%V 1
%I MIT Press
%C Cambridge, MA
%F ritter-etal-2013-modeling
%X Distant supervision algorithms learn information extraction models given only large readily available databases and text collections. Most previous work has used heuristics for generating labeled data, for example assuming that facts not contained in the database are not mentioned in the text, and facts in the database must be mentioned at least once. In this paper, we propose a new latent-variable approach that models missing data. This provides a natural way to incorporate side information, for instance modeling the intuition that text will often mention rare entities which are likely to be missing in the database. Despite the added complexity introduced by reasoning about missing data, we demonstrate that a carefully designed local search approach to inference is very accurate and scales to large datasets. Experiments demonstrate improved performance for binary and unary relation extraction when compared to learning with heuristic labels, including on average a 27% increase in area under the precision recall curve in the binary case.
%R 10.1162/tacl_a_00234
%U https://aclanthology.org/Q13-1030
%U https://doi.org/10.1162/tacl_a_00234
%P 367-378
Markdown (Informal)
[Modeling Missing Data in Distant Supervision for Information Extraction](https://aclanthology.org/Q13-1030) (Ritter et al., TACL 2013)
ACL
Alan Ritter, Luke Zettlemoyer, Mausam, and Oren Etzioni. 2013. Modeling Missing Data in Distant Supervision for Information Extraction. Transactions of the Association for Computational Linguistics, 1:367–378.