% BibTeX record for the paper below; the same work follows as MODS XML and EndNote records.
@inproceedings{pan-etal-2025-taxonomy,
    title     = {Taxonomy-Driven Knowledge Graph Construction for Domain-Specific Scientific Applications},
    author    = {Pan, Huitong and
                 Zhang, Qi and
                 Adamu, Mustapha and
                 Dragut, Eduard and
                 Latecki, Longin Jan},
    editor    = {Che, Wanxiang and
                 Nabende, Joyce and
                 Shutova, Ekaterina and
                 Pilehvar, Mohammad Taher},
    booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2025},
    month     = jul,
    year      = {2025},
    address   = {Vienna, Austria},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.findings-acl.223/},
    doi       = {10.18653/v1/2025.findings-acl.223},
    pages     = {4295--4320},
    isbn      = {979-8-89176-256-5},
    abstract  = {We present a taxonomy-driven framework for constructing domain-specific knowledge graphs (KGs) that integrates structured taxonomies, Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG). Although we focus on climate science to illustrate its effectiveness, our approach can potentially be adapted for other specialized domains. Existing methods often neglect curated taxonomies{---}hierarchies of verified entities and relationships{---}and LLMs frequently struggle to extract KGs in specialized domains. Our approach addresses these gaps by anchoring extraction to expert-curated taxonomies, aligning entities and relations with domain semantics, and validating LLM outputs using RAG against the domain taxonomy. Through a climate science case study using our annotated dataset of 25 publications (1,705 entity-publication links, 3,618 expert-validated relationships), we demonstrate that taxonomy-guided LLM prompting combined with RAG-based validation reduces hallucinations by 23.3{\%} while improving F1 scores by 13.9{\%} compared to baselines without the proposed techniques. Our contributions include: 1) a generalizable methodology for taxonomy-aligned KG construction; 2) a reproducible annotation pipeline, 3) the first benchmark dataset for climate science information retrieval; and 4) empirical insights into combining structured taxonomies with LLMs for specialized domains. The dataset, including expert annotations and taxonomy-aligned outputs, is publicly available at \url{https://github.com/Jo-Pan/ClimateIE}, and the accompanying framework can be accessed at \url{https://github.com/Jo-Pan/TaxoDrivenKG}.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, keyed by the same ID as the citekey identifier below. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="pan-etal-2025-taxonomy">
    <titleInfo>
      <title>Taxonomy-Driven Knowledge Graph Construction for Domain-Specific Scientific Applications</title>
    </titleInfo>
    <!-- Authors, in listed order. -->
    <name type="personal">
      <namePart type="given">Huitong</namePart>
      <namePart type="family">Pan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qi</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mustapha</namePart>
      <namePart type="family">Adamu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Eduard</namePart>
      <namePart type="family">Dragut</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Longin</namePart>
      <namePart type="given">Jan</namePart>
      <namePart type="family">Latecki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>We present a taxonomy-driven framework for constructing domain-specific knowledge graphs (KGs) that integrates structured taxonomies, Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG). Although we focus on climate science to illustrate its effectiveness, our approach can potentially be adapted for other specialized domains. Existing methods often neglect curated taxonomies—hierarchies of verified entities and relationships—and LLMs frequently struggle to extract KGs in specialized domains. Our approach addresses these gaps by anchoring extraction to expert-curated taxonomies, aligning entities and relations with domain semantics, and validating LLM outputs using RAG against the domain taxonomy. Through a climate science case study using our annotated dataset of 25 publications (1,705 entity-publication links, 3,618 expert-validated relationships), we demonstrate that taxonomy-guided LLM prompting combined with RAG-based validation reduces hallucinations by 23.3% while improving F1 scores by 13.9% compared to baselines without the proposed techniques. Our contributions include: 1) a generalizable methodology for taxonomy-aligned KG construction; 2) a reproducible annotation pipeline, 3) the first benchmark dataset for climate science information retrieval; and 4) empirical insights into combining structured taxonomies with LLMs for specialized domains. The dataset, including expert annotations and taxonomy-aligned outputs, is publicly available at https://github.com/Jo-Pan/ClimateIE, and the accompanying framework can be accessed at https://github.com/Jo-Pan/TaxoDrivenKG.</abstract>
    <!-- Identifiers and landing-page URL for this paper. -->
    <identifier type="citekey">pan-etal-2025-taxonomy</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-acl.223</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-acl.223/</url>
    </location>
    <!-- Page extent of the paper. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>4295</start>
        <end>4320</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Taxonomy-Driven Knowledge Graph Construction for Domain-Specific Scientific Applications
%A Pan, Huitong
%A Zhang, Qi
%A Adamu, Mustapha
%A Dragut, Eduard
%A Latecki, Longin Jan
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F pan-etal-2025-taxonomy
%X We present a taxonomy-driven framework for constructing domain-specific knowledge graphs (KGs) that integrates structured taxonomies, Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG). Although we focus on climate science to illustrate its effectiveness, our approach can potentially be adapted for other specialized domains. Existing methods often neglect curated taxonomies—hierarchies of verified entities and relationships—and LLMs frequently struggle to extract KGs in specialized domains. Our approach addresses these gaps by anchoring extraction to expert-curated taxonomies, aligning entities and relations with domain semantics, and validating LLM outputs using RAG against the domain taxonomy. Through a climate science case study using our annotated dataset of 25 publications (1,705 entity-publication links, 3,618 expert-validated relationships), we demonstrate that taxonomy-guided LLM prompting combined with RAG-based validation reduces hallucinations by 23.3% while improving F1 scores by 13.9% compared to baselines without the proposed techniques. Our contributions include: 1) a generalizable methodology for taxonomy-aligned KG construction; 2) a reproducible annotation pipeline, 3) the first benchmark dataset for climate science information retrieval; and 4) empirical insights into combining structured taxonomies with LLMs for specialized domains. The dataset, including expert annotations and taxonomy-aligned outputs, is publicly available at https://github.com/Jo-Pan/ClimateIE, and the accompanying framework can be accessed at https://github.com/Jo-Pan/TaxoDrivenKG.
%R 10.18653/v1/2025.findings-acl.223
%U https://aclanthology.org/2025.findings-acl.223/
%U https://doi.org/10.18653/v1/2025.findings-acl.223
%P 4295-4320
Markdown (Informal)
[Taxonomy-Driven Knowledge Graph Construction for Domain-Specific Scientific Applications](https://aclanthology.org/2025.findings-acl.223/) (Pan et al., Findings 2025)
ACL