@article{baier-etal-2025-modular,
    title = "A modular architecture for creating multimodal embodied agents with an episodic Knowledge Graph as an explainable and controllable long-term memory",
    author = "Baier, Thomas and
      Santamar{\'i}a, Selene B{\'a}ez and
      Vossen, Piek",
    editor = "Zeldes, Amir and
      Stede, Manfred and
      Healey, Patrick G. T. and
      Buschmeier, Hendrik",
    journal = "Dialogue {\&} Discourse",
    volume = "16",
    month = dec,
    year = "2025",
    address = "Chicago, Illinois, USA",
    publisher = "University of Illinois Chicago",
    url = "https://aclanthology.org/2025.dnd-16.11/",
    doi = "10.5210/dad.2025.303",
    pages = "25--59",
    abstract = "How can flexibility and control over the interpretation of multimodal signals by embodied agents be balanced? Flexibility means that agents respond fluently in any context, whereas control means that responses are transparent and faithful to goals and principles that are explicitly defined. This paper describes a modular platform to create multimodal interactive agents using an event bus on which signals and interpretations are posted as a sequence in time, but also provides control options to drive the interaction given specific intentions and goals. Different sensors and interpretation components can be integrated by defining their input and output topics in the event bus, which results in an open multimodal sequence-driven workflow for further interpretations. In addition, our platform allows us to define higher-level intents that control sequence patterns to achieve a goal. A key component is an episodic Knowledge Graph (eKG) that acts as a long-term symbolic memory to aggregate and connect these interpretations. This eKG establishes coherence and continuity across different interactions. Intents and the eKG make it possible to define different (embodied) agents and compare their behavior without having to implement complex software components for multimodal sensor data and design the control over their dependencies. In this paper, we explain the broad range of components that we developed and integrated into various interactive agents. We also explain how the interaction is recorded as multimodal data and how it results in an aggregated memory in the eKG. By analyzing the recorded interaction, we can compare agents and agent components and study their interactive behavior with people and other agents."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="baier-etal-2025-modular">
<titleInfo>
<title>A modular architecture for creating multimodal embodied agents with an episodic Knowledge Graph as an explainable and controllable long-term memory</title>
</titleInfo>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Baier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Selene</namePart>
<namePart type="given">Báez</namePart>
<namePart type="family">Santamaría</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Piek</namePart>
<namePart type="family">Vossen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Dialogue &amp; Discourse</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>University of Illinois Chicago</publisher>
<place>
<placeTerm type="text">Chicago, Illinois, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>How can flexibility and control over the interpretation of multimodal signals by embodied agents be balanced? Flexibility means that agents respond fluently in any context, whereas control means that responses are transparent and faithful to goals and principles that are explicitly defined. This paper describes a modular platform to create multimodal interactive agents using an event bus on which signals and interpretations are posted as a sequence in time, but also provides control options to drive the interaction given specific intentions and goals. Different sensors and interpretation components can be integrated by defining their input and output topics in the event bus, which results in an open multimodal sequence-driven workflow for further interpretations. In addition, our platform allows us to define higher-level intents that control sequence patterns to achieve a goal. A key component is an episodic Knowledge Graph (eKG) that acts as a long-term symbolic memory to aggregate and connect these interpretations. This eKG establishes coherence and continuity across different interactions. Intents and the eKG make it possible to define different (embodied) agents and compare their behavior without having to implement complex software components for multimodal sensor data and design the control over their dependencies. In this paper, we explain the broad range of components that we developed and integrated into various interactive agents. We also explain how the interaction is recorded as multimodal data and how it results in an aggregated memory in the eKG. By analyzing the recorded interaction, we can compare agents and agent components and study their interactive behavior with people and other agents.</abstract>
<identifier type="citekey">baier-etal-2025-modular</identifier>
<identifier type="doi">10.5210/dad.2025.303</identifier>
<location>
<url>https://aclanthology.org/2025.dnd-16.11/</url>
</location>
<part>
<date>2025-12</date>
<detail type="volume"><number>16</number></detail>
<extent unit="page">
<start>25</start>
<end>59</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T A modular architecture for creating multimodal embodied agents with an episodic Knowledge Graph as an explainable and controllable long-term memory
%A Baier, Thomas
%A Santamaría, Selene Báez
%A Vossen, Piek
%J Dialogue & Discourse
%D 2025
%8 December
%V 16
%I University of Illinois Chicago
%C Chicago, Illinois, USA
%F baier-etal-2025-modular
%X How can flexibility and control over the interpretation of multimodal signals by embodied agents be balanced? Flexibility means that agents respond fluently in any context, whereas control means that responses are transparent and faithful to goals and principles that are explicitly defined. This paper describes a modular platform to create multimodal interactive agents using an event bus on which signals and interpretations are posted as a sequence in time, but also provides control options to drive the interaction given specific intentions and goals. Different sensors and interpretation components can be integrated by defining their input and output topics in the event bus, which results in an open multimodal sequence-driven workflow for further interpretations. In addition, our platform allows us to define higher-level intents that control sequence patterns to achieve a goal. A key component is an episodic Knowledge Graph (eKG) that acts as a long-term symbolic memory to aggregate and connect these interpretations. This eKG establishes coherence and continuity across different interactions. Intents and the eKG make it possible to define different (embodied) agents and compare their behavior without having to implement complex software components for multimodal sensor data and design the control over their dependencies. In this paper, we explain the broad range of components that we developed and integrated into various interactive agents. We also explain how the interaction is recorded as multimodal data and how it results in an aggregated memory in the eKG. By analyzing the recorded interaction, we can compare agents and agent components and study their interactive behavior with people and other agents.
%R 10.5210/dad.2025.303
%U https://aclanthology.org/2025.dnd-16.11/
%U https://doi.org/10.5210/dad.2025.303
%P 25-59
Markdown (Informal)
[A modular architecture for creating multimodal embodied agents with an episodic Knowledge Graph as an explainable and controllable long-term memory](https://aclanthology.org/2025.dnd-16.11/) (Baier et al., DND 2025)
ACL