@inproceedings{olvera-etal-2025-iknow,
title = "i{K}now-audio: Integrating Knowledge Graphs with Audio-Language Models",
author = {Olvera, Michel and
Wang, Changhong and
Stamatiadis, Paraskevas and
Richard, Ga{\"e}l and
Essid, Slim},
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1759/",
pages = "34671--34688",
ISBN = "979-8-89176-332-6",
abstract = "Contrastive Language{--}Audio Pretraining (CLAP) models learn by aligning audio and text in a shared embedding space, enabling powerful zero-shot recognition. However, their performance is highly sensitive to prompt formulation and language nuances, and they often inherit semantic ambiguities and spurious correlations from noisy pretraining data. While prior work has explored prompt engineering, adapters, and prefix tuning to address these limitations, the use of structured prior knowledge remains largely unexplored. We present iKnow-audio, a framework that integrates knowledge graphs with audio-language models to provide robust semantic grounding. iKnow-audio builds on the Audio-centric Knowledge Graph (AKG), which encodes ontological relations comprising semantic, causal, and taxonomic connections reflective of everyday sound scenes and events. By training knowlege graph embedding models on the AKG and refining CLAP predictions through this structured knowledge, iKnow-audio improves disambiguation of acoustically similar sounds and reduces reliance on prompt engineering. Comprehensive zero-shot evaluations across six benchmark datasets demonstrate consistent gains over baseline CLAP, supported by embedding-space analyses that highlight improved relational grounding. Resources are publicly available at https://github.com/michelolzam/iknow-audio"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="olvera-etal-2025-iknow">
<titleInfo>
<title>iKnow-audio: Integrating Knowledge Graphs with Audio-Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michel</namePart>
<namePart type="family">Olvera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Changhong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paraskevas</namePart>
<namePart type="family">Stamatiadis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gaël</namePart>
<namePart type="family">Richard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slim</namePart>
<namePart type="family">Essid</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Contrastive Language–Audio Pretraining (CLAP) models learn by aligning audio and text in a shared embedding space, enabling powerful zero-shot recognition. However, their performance is highly sensitive to prompt formulation and language nuances, and they often inherit semantic ambiguities and spurious correlations from noisy pretraining data. While prior work has explored prompt engineering, adapters, and prefix tuning to address these limitations, the use of structured prior knowledge remains largely unexplored. We present iKnow-audio, a framework that integrates knowledge graphs with audio-language models to provide robust semantic grounding. iKnow-audio builds on the Audio-centric Knowledge Graph (AKG), which encodes ontological relations comprising semantic, causal, and taxonomic connections reflective of everyday sound scenes and events. By training knowledge graph embedding models on the AKG and refining CLAP predictions through this structured knowledge, iKnow-audio improves disambiguation of acoustically similar sounds and reduces reliance on prompt engineering. Comprehensive zero-shot evaluations across six benchmark datasets demonstrate consistent gains over baseline CLAP, supported by embedding-space analyses that highlight improved relational grounding. Resources are publicly available at https://github.com/michelolzam/iknow-audio</abstract>
<identifier type="citekey">olvera-etal-2025-iknow</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1759/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>34671</start>
<end>34688</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T iKnow-audio: Integrating Knowledge Graphs with Audio-Language Models
%A Olvera, Michel
%A Wang, Changhong
%A Stamatiadis, Paraskevas
%A Richard, Gaël
%A Essid, Slim
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F olvera-etal-2025-iknow
%X Contrastive Language–Audio Pretraining (CLAP) models learn by aligning audio and text in a shared embedding space, enabling powerful zero-shot recognition. However, their performance is highly sensitive to prompt formulation and language nuances, and they often inherit semantic ambiguities and spurious correlations from noisy pretraining data. While prior work has explored prompt engineering, adapters, and prefix tuning to address these limitations, the use of structured prior knowledge remains largely unexplored. We present iKnow-audio, a framework that integrates knowledge graphs with audio-language models to provide robust semantic grounding. iKnow-audio builds on the Audio-centric Knowledge Graph (AKG), which encodes ontological relations comprising semantic, causal, and taxonomic connections reflective of everyday sound scenes and events. By training knowledge graph embedding models on the AKG and refining CLAP predictions through this structured knowledge, iKnow-audio improves disambiguation of acoustically similar sounds and reduces reliance on prompt engineering. Comprehensive zero-shot evaluations across six benchmark datasets demonstrate consistent gains over baseline CLAP, supported by embedding-space analyses that highlight improved relational grounding. Resources are publicly available at https://github.com/michelolzam/iknow-audio
%U https://aclanthology.org/2025.emnlp-main.1759/
%P 34671-34688
Markdown (Informal)
[iKnow-audio: Integrating Knowledge Graphs with Audio-Language Models](https://aclanthology.org/2025.emnlp-main.1759/) (Olvera et al., EMNLP 2025)
ACL
Michel Olvera, Changhong Wang, Paraskevas Stamatiadis, Gaël Richard, and Slim Essid. 2025. iKnow-audio: Integrating Knowledge Graphs with Audio-Language Models. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 34671–34688, Suzhou, China. Association for Computational Linguistics.