@inproceedings{sosa-etal-2025-conceptual,
title = "Conceptual Diagnostics for Knowledge Graphs and Large Language Models",
author = "Sosa, Rosario Uceda and
Chang, Maria and
Natesan Ramamurthy, Karthikeyan and
Singh, Moninder",
editor = "Rehm, Georg and
Li, Yunyao",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-industry.37/",
doi = "10.18653/v1/2025.acl-industry.37",
pages = "531--540",
ISBN = "979-8-89176-288-6",
abstract = "Industrial applications pose heightened requirements for consistency and reliability of large language models (LLMs). While LLMs are being tested with increasingly complex reasoning tasks, we argue that much can be learned via diagnostic tools that probe a fundamentally basic type of reasoning: conceptual consistency, e.g., a rule applying to ``all surgeons'' must also apply to ``cardiac surgeons'' since a cardiac surgeon is a type of surgeon. In this emerging industry track submission, we propose a method that takes concept hierarchies from a knowledge graph (KG) and automatically generates benchmarks that test conceptual consistency in LLMs. We develop a multi-domain benchmark that reveals rates of conceptual inconsistencies in several state of the art LLMs. Additionally, we use measured levels of inconsistency and disagreement in LLMs to find potentially problematic subgraphs in the reference KG. As such, it offers a scalable complement to symbolic curation, maintenance, and refinement of knowledge graphs, which is a critical activity in KG-based industrial applications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sosa-etal-2025-conceptual">
    <titleInfo>
      <title>Conceptual Diagnostics for Knowledge Graphs and Large Language Models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rosario</namePart>
      <namePart type="given">Uceda</namePart>
      <namePart type="family">Sosa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maria</namePart>
      <namePart type="family">Chang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Karthikeyan</namePart>
      <namePart type="family">Natesan Ramamurthy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Moninder</namePart>
      <namePart type="family">Singh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-288-6</identifier>
    </relatedItem>
    <abstract>Industrial applications pose heightened requirements for consistency and reliability of large language models (LLMs). While LLMs are being tested with increasingly complex reasoning tasks, we argue that much can be learned via diagnostic tools that probe a fundamentally basic type of reasoning: conceptual consistency, e.g., a rule applying to “all surgeons” must also apply to “cardiac surgeons” since a cardiac surgeon is a type of surgeon. In this emerging industry track submission, we propose a method that takes concept hierarchies from a knowledge graph (KG) and automatically generates benchmarks that test conceptual consistency in LLMs. We develop a multi-domain benchmark that reveals rates of conceptual inconsistencies in several state of the art LLMs. Additionally, we use measured levels of inconsistency and disagreement in LLMs to find potentially problematic subgraphs in the reference KG. As such, it offers a scalable complement to symbolic curation, maintenance, and refinement of knowledge graphs, which is a critical activity in KG-based industrial applications.</abstract>
    <identifier type="citekey">sosa-etal-2025-conceptual</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-industry.37</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-industry.37/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>531</start>
        <end>540</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Conceptual Diagnostics for Knowledge Graphs and Large Language Models
%A Sosa, Rosario Uceda
%A Chang, Maria
%A Natesan Ramamurthy, Karthikeyan
%A Singh, Moninder
%Y Rehm, Georg
%Y Li, Yunyao
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-288-6
%F sosa-etal-2025-conceptual
%X Industrial applications pose heightened requirements for consistency and reliability of large language models (LLMs). While LLMs are being tested with increasingly complex reasoning tasks, we argue that much can be learned via diagnostic tools that probe a fundamentally basic type of reasoning: conceptual consistency, e.g., a rule applying to “all surgeons” must also apply to “cardiac surgeons” since a cardiac surgeon is a type of surgeon. In this emerging industry track submission, we propose a method that takes concept hierarchies from a knowledge graph (KG) and automatically generates benchmarks that test conceptual consistency in LLMs. We develop a multi-domain benchmark that reveals rates of conceptual inconsistencies in several state of the art LLMs. Additionally, we use measured levels of inconsistency and disagreement in LLMs to find potentially problematic subgraphs in the reference KG. As such, it offers a scalable complement to symbolic curation, maintenance, and refinement of knowledge graphs, which is a critical activity in KG-based industrial applications.
%R 10.18653/v1/2025.acl-industry.37
%U https://aclanthology.org/2025.acl-industry.37/
%U https://doi.org/10.18653/v1/2025.acl-industry.37
%P 531-540
Markdown (Informal)
[Conceptual Diagnostics for Knowledge Graphs and Large Language Models](https://aclanthology.org/2025.acl-industry.37/) (Sosa et al., ACL 2025)

ACL
- Rosario Uceda Sosa, Maria Chang, Karthikeyan Natesan Ramamurthy, and Moninder Singh. 2025. Conceptual Diagnostics for Knowledge Graphs and Large Language Models. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 6: Industry Track), pages 531–540, Vienna, Austria. Association for Computational Linguistics.