@article{dutta-weikum-2015-cross,
title = "Cross-Document Co-Reference Resolution using Sample-Based Clustering with Knowledge Enrichment",
author = "Dutta, Sourav and
Weikum, Gerhard",
editor = "Collins, Michael and
Lee, Lillian",
journal = "Transactions of the Association for Computational Linguistics",
volume = "3",
year = "2015",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q15-1002",
doi = "10.1162/tacl_a_00119",
pages = "15--28",
abstract = "Identifying and linking named entities across information sources is the basis of knowledge acquisition and at the heart of Web search, recommendations, and analytics. An important problem in this context is cross-document co-reference resolution (CCR): computing equivalence classes of textual mentions denoting the same entity, within and across documents. Prior methods employ ranking, clustering, or probabilistic graphical models using syntactic features and distant features from knowledge bases. However, these methods exhibit limitations regarding run-time and robustness. This paper presents the CROCS framework for unsupervised CCR, improving the state of the art in two ways. First, we extend the way knowledge bases are harnessed, by constructing a notion of semantic summaries for intra-document co-reference chains using co-occurring entity mentions belonging to different chains. Second, we reduce the computational cost by a new algorithm that embeds sample-based bisection, using spectral clustering or graph partitioning, in a hierarchical clustering process. This allows scaling up CCR to large corpora. Experiments with three datasets show significant gains in output quality, compared to the best prior methods, and the run-time efficiency of CROCS.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dutta-weikum-2015-cross">
<titleInfo>
<title>Cross-Document Co-Reference Resolution using Sample-Based Clustering with Knowledge Enrichment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sourav</namePart>
<namePart type="family">Dutta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerhard</namePart>
<namePart type="family">Weikum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2015</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Identifying and linking named entities across information sources is the basis of knowledge acquisition and at the heart of Web search, recommendations, and analytics. An important problem in this context is cross-document co-reference resolution (CCR): computing equivalence classes of textual mentions denoting the same entity, within and across documents. Prior methods employ ranking, clustering, or probabilistic graphical models using syntactic features and distant features from knowledge bases. However, these methods exhibit limitations regarding run-time and robustness. This paper presents the CROCS framework for unsupervised CCR, improving the state of the art in two ways. First, we extend the way knowledge bases are harnessed, by constructing a notion of semantic summaries for intra-document co-reference chains using co-occurring entity mentions belonging to different chains. Second, we reduce the computational cost by a new algorithm that embeds sample-based bisection, using spectral clustering or graph partitioning, in a hierarchical clustering process. This allows scaling up CCR to large corpora. Experiments with three datasets show significant gains in output quality, compared to the best prior methods, and the run-time efficiency of CROCS.</abstract>
<identifier type="citekey">dutta-weikum-2015-cross</identifier>
<identifier type="doi">10.1162/tacl_a_00119</identifier>
<location>
<url>https://aclanthology.org/Q15-1002</url>
</location>
<part>
<date>2015</date>
<detail type="volume"><number>3</number></detail>
<extent unit="page">
<start>15</start>
<end>28</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Cross-Document Co-Reference Resolution using Sample-Based Clustering with Knowledge Enrichment
%A Dutta, Sourav
%A Weikum, Gerhard
%J Transactions of the Association for Computational Linguistics
%D 2015
%V 3
%I MIT Press
%C Cambridge, MA
%F dutta-weikum-2015-cross
%X Identifying and linking named entities across information sources is the basis of knowledge acquisition and at the heart of Web search, recommendations, and analytics. An important problem in this context is cross-document co-reference resolution (CCR): computing equivalence classes of textual mentions denoting the same entity, within and across documents. Prior methods employ ranking, clustering, or probabilistic graphical models using syntactic features and distant features from knowledge bases. However, these methods exhibit limitations regarding run-time and robustness. This paper presents the CROCS framework for unsupervised CCR, improving the state of the art in two ways. First, we extend the way knowledge bases are harnessed, by constructing a notion of semantic summaries for intra-document co-reference chains using co-occurring entity mentions belonging to different chains. Second, we reduce the computational cost by a new algorithm that embeds sample-based bisection, using spectral clustering or graph partitioning, in a hierarchical clustering process. This allows scaling up CCR to large corpora. Experiments with three datasets show significant gains in output quality, compared to the best prior methods, and the run-time efficiency of CROCS.
%R 10.1162/tacl_a_00119
%U https://aclanthology.org/Q15-1002
%U https://doi.org/10.1162/tacl_a_00119
%P 15-28
Markdown (Informal)
[Cross-Document Co-Reference Resolution using Sample-Based Clustering with Knowledge Enrichment](https://aclanthology.org/Q15-1002) (Dutta & Weikum, TACL 2015)
ACL