@inproceedings{ashury-tahan-etal-2024-data,
title = "Data-driven Coreference-based Ontology Building",
author = "Ashury Tahan, Shir and
Cohen, Amir and
Cohen, Nadav and
Louzoun, Yoram and
Goldberg, Yoav",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.834",
pages = "14290--14300",
abstract = "While coreference resolution is traditionally used as a component in individual document understanding, in this work we take a more global view and explore what can we learn about a domain from the set of all document-level coreference relations that are present in a large corpus. We derive coreference chains from a corpus of 30 million biomedical abstracts and construct a graph based on the string phrases within these chains, establishing connections between phrases if they co-occur within the same coreference chain. We then use the graph structure and the betweeness centrality measure to distinguish between edges denoting hierarchy, identity and noise, assign directionality to edges denoting hierarchy, and split nodes (strings) that correspond to multiple distinct concepts. The result is a rich, data-driven ontology over concepts in the biomedical domain, parts of which overlaps significantly with human-authored ontologies. We release the coreference chains and resulting ontology under a creative-commons license.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ashury-tahan-etal-2024-data">
<titleInfo>
<title>Data-driven Coreference-based Ontology Building</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shir</namePart>
<namePart type="family">Ashury Tahan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadav</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoram</namePart>
<namePart type="family">Louzoun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoav</namePart>
<namePart type="family">Goldberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While coreference resolution is traditionally used as a component in individual document understanding, in this work we take a more global view and explore what can we learn about a domain from the set of all document-level coreference relations that are present in a large corpus. We derive coreference chains from a corpus of 30 million biomedical abstracts and construct a graph based on the string phrases within these chains, establishing connections between phrases if they co-occur within the same coreference chain. We then use the graph structure and the betweeness centrality measure to distinguish between edges denoting hierarchy, identity and noise, assign directionality to edges denoting hierarchy, and split nodes (strings) that correspond to multiple distinct concepts. The result is a rich, data-driven ontology over concepts in the biomedical domain, parts of which overlaps significantly with human-authored ontologies. We release the coreference chains and resulting ontology under a creative-commons license.</abstract>
<identifier type="citekey">ashury-tahan-etal-2024-data</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.834</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>14290</start>
<end>14300</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data-driven Coreference-based Ontology Building
%A Ashury Tahan, Shir
%A Cohen, Amir
%A Cohen, Nadav
%A Louzoun, Yoram
%A Goldberg, Yoav
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F ashury-tahan-etal-2024-data
%X While coreference resolution is traditionally used as a component in individual document understanding, in this work we take a more global view and explore what can we learn about a domain from the set of all document-level coreference relations that are present in a large corpus. We derive coreference chains from a corpus of 30 million biomedical abstracts and construct a graph based on the string phrases within these chains, establishing connections between phrases if they co-occur within the same coreference chain. We then use the graph structure and the betweeness centrality measure to distinguish between edges denoting hierarchy, identity and noise, assign directionality to edges denoting hierarchy, and split nodes (strings) that correspond to multiple distinct concepts. The result is a rich, data-driven ontology over concepts in the biomedical domain, parts of which overlaps significantly with human-authored ontologies. We release the coreference chains and resulting ontology under a creative-commons license.
%U https://aclanthology.org/2024.findings-emnlp.834
%P 14290-14300
Markdown (Informal)
[Data-driven Coreference-based Ontology Building](https://aclanthology.org/2024.findings-emnlp.834) (Ashury Tahan et al., Findings 2024)
ACL
- Shir Ashury Tahan, Amir Cohen, Nadav Cohen, Yoram Louzoun, and Yoav Goldberg. 2024. Data-driven Coreference-based Ontology Building. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 14290–14300, Miami, Florida, USA. Association for Computational Linguistics.