% ACL Anthology export: conference paper in the BioNLP 2026 proceedings.
% `address` carries the location exactly as exported by the Anthology.
% NOTE(review): the export had lost several characters in the abstract
% (dashes shown as "?", comparison operators dropped, one Greek letter
% shown as "?"). They are restored here from context as Wu--Palmer,
% Mann--Whitney, $d < 0$, $p > 0.99$ and $|\rho| < 0.08$ -- confirm the
% operators and the correlation symbol against the published abstract.
@inproceedings{rubinfeld-etal-2026-ontological,
    title = "Ontological Validation of Biomedical Topic Models: {SNOMED} {CT} Hierarchy Distance as an Automated Evaluation Metric",
    author = "Rubinfeld, Ilan  and
      Zaidi, Sami  and
      Djuric, Milosh  and
      Kabbani, Loay  and
      Halabi, Mouhammad  and
      Shepard, Alex",
    editor = "Demner-Fushman, Dina  and
      Ananiadou, Sophia  and
      Roberts, Kirk  and
      Tsujii, Junichi",
    booktitle = "{BioNLP} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.bionlp-1.27/",
    pages = "342--352",
    isbn = "979-8-89176-434-7",
    abstract = "Standard coherence metrics for biomedical topic models encode no clinical knowledge and cannot detect clinically implausible topic groupings. We propose SNOMED CT Wu--Palmer hierarchy distance as a post hoc, ontology-grounded diagnostic. On vascular surgery (47,318 articles) and craniofacial surgery (27,493 articles) corpora, the metric flags clinically heterogeneous topics that coherence misses---e.g., abdominal aortic aneurysm repair grouped with deep vein thrombosis (d = 0.600). Diagnostic signals are nearly identical across eight BERTopic embedding strategies including ontology-enhanced models, but diverge across model families: BERTopic alone produces a positive within- vs. cross-topic Cohen{'}s d, while LDA, NMF, and Top2Vec at matched topic counts score below their own cross-topic baselines (Cohen{'}s $d < 0$; Mann--Whitney $p > 0.99$). The score is therefore sensitive to topic-model output choice, not only to embedding choice within a single pipeline. A pre-clustering screening experiment finds near-zero correlation ($|\rho| < 0.08$) between embedding cosine and SNOMED CT similarity, arguing that ontological validation belongs after clustering rather than as an embedding screen. We additionally describe a two-stage UMLS-CUI stopword filter that preserves high-frequency domain-specific concepts which naive frequency filtering would discard. After one-time concept curation, the diagnostic itself is automated and requires no per-topic expert scoring."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rubinfeld-etal-2026-ontological">
<titleInfo>
<title>Ontological Validation of Biomedical Topic Models: SNOMED CT Hierarchy Distance as an Automated Evaluation Metric</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ilan</namePart>
<namePart type="family">Rubinfeld</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sami</namePart>
<namePart type="family">Zaidi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Milosh</namePart>
<namePart type="family">Djuric</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Loay</namePart>
<namePart type="family">Kabbani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mouhammad</namePart>
<namePart type="family">Halabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Shepard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>BioNLP 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dina</namePart>
<namePart type="family">Demner-Fushman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kirk</namePart>
<namePart type="family">Roberts</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Tsujii</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-434-7</identifier>
</relatedItem>
<abstract>Standard coherence metrics for biomedical topic models encode no clinical knowledge and cannot detect clinically implausible topic groupings. We propose SNOMED CT Wu–Palmer hierarchy distance as a post hoc, ontology-grounded diagnostic. On vascular surgery (47,318 articles) and craniofacial surgery (27,493 articles) corpora, the metric flags clinically heterogeneous topics that coherence misses—e.g., abdominal aortic aneurysm repair grouped with deep vein thrombosis (d = 0.600). Diagnostic signals are nearly identical across eight BERTopic embedding strategies including ontology-enhanced models, but diverge across model families: BERTopic alone produces a positive within- vs. cross-topic Cohen’s d, while LDA, NMF, and Top2Vec at matched topic counts score below their own cross-topic baselines (Cohen’s d &lt; 0; Mann–Whitney p &gt; 0.99). The score is therefore sensitive to topic-model output choice, not only to embedding choice within a single pipeline. A pre-clustering screening experiment finds near-zero correlation (|ρ| &lt; 0.08) between embedding cosine and SNOMED CT similarity, arguing that ontological validation belongs after clustering rather than as an embedding screen. We additionally describe a two-stage UMLS-CUI stopword filter that preserves high-frequency domain-specific concepts which naive frequency filtering would discard. After one-time concept curation, the diagnostic itself is automated and requires no per-topic expert scoring.</abstract>
<identifier type="citekey">rubinfeld-etal-2026-ontological</identifier>
<location>
<url>https://aclanthology.org/2026.bionlp-1.27/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>342</start>
<end>352</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Ontological Validation of Biomedical Topic Models: SNOMED CT Hierarchy Distance as an Automated Evaluation Metric
%A Rubinfeld, Ilan
%A Zaidi, Sami
%A Djuric, Milosh
%A Kabbani, Loay
%A Halabi, Mouhammad
%A Shepard, Alex
%Y Demner-Fushman, Dina
%Y Ananiadou, Sophia
%Y Roberts, Kirk
%Y Tsujii, Junichi
%S BioNLP 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California
%@ 979-8-89176-434-7
%F rubinfeld-etal-2026-ontological
%X Standard coherence metrics for biomedical topic models encode no clinical knowledge and cannot detect clinically implausible topic groupings. We propose SNOMED CT Wu–Palmer hierarchy distance as a post hoc, ontology-grounded diagnostic. On vascular surgery (47,318 articles) and craniofacial surgery (27,493 articles) corpora, the metric flags clinically heterogeneous topics that coherence misses—e.g., abdominal aortic aneurysm repair grouped with deep vein thrombosis (d = 0.600). Diagnostic signals are nearly identical across eight BERTopic embedding strategies including ontology-enhanced models, but diverge across model families: BERTopic alone produces a positive within- vs. cross-topic Cohen’s d, while LDA, NMF, and Top2Vec at matched topic counts score below their own cross-topic baselines (Cohen’s d < 0; Mann–Whitney p > 0.99). The score is therefore sensitive to topic-model output choice, not only to embedding choice within a single pipeline. A pre-clustering screening experiment finds near-zero correlation (|ρ| < 0.08) between embedding cosine and SNOMED CT similarity, arguing that ontological validation belongs after clustering rather than as an embedding screen. We additionally describe a two-stage UMLS-CUI stopword filter that preserves high-frequency domain-specific concepts which naive frequency filtering would discard. After one-time concept curation, the diagnostic itself is automated and requires no per-topic expert scoring.
%U https://aclanthology.org/2026.bionlp-1.27/
%P 342-352
Markdown (Informal)
[Ontological Validation of Biomedical Topic Models: SNOMED CT Hierarchy Distance as an Automated Evaluation Metric](https://aclanthology.org/2026.bionlp-1.27/) (Rubinfeld et al., BioNLP 2026)
ACL