@inproceedings{hromei-etal-2025-training,
title = "Training Multi-Modal {LLM}s through Dialogue Planning for {HRI}",
author = "Hromei, Claudiu Daniel and
Borazio, Federico and
Sensi, Andrea and
Passone, Elisa and
Croce, Danilo and
Basili, Roberto",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.837/",
doi = "10.18653/v1/2025.findings-acl.837",
pages = "16266--16284",
ISBN = "979-8-89176-256-5",
abstract = "Grounded natural language understanding in Human-Robot Interaction (HRI) requires integrating linguistic, visual, and world knowledge to ensure effective task execution. We propose an approach that enhances Multi-Modal Large Language Models (MLLMs) with a novel explicit dialogue planning phase, allowing robotic agents to systematically refine their understanding of ambiguous commands through structured clarification steps. This reduces hallucinations and improves task feasibility.To evaluate this approach, we introduce a novel dataset of over 1,100 annotated dialogues in English and Italian, designed for fine-tuning and assessing Multi-Modal models in HRI scenarios. Experimental results show that dialogue planning improves response accuracy and quality, and contributes to cross-lingual generalisation, enabling models trained in one language to transfer effectively to another. To the best of our knowledge, this is the first application of structured, goal-driven, and explicit dialogue planning in Multi-Modal LLMs for grounded interaction."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hromei-etal-2025-training">
<titleInfo>
<title>Training Multi-Modal LLMs through Dialogue Planning for HRI</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claudiu</namePart>
<namePart type="given">Daniel</namePart>
<namePart type="family">Hromei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">Borazio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Sensi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elisa</namePart>
<namePart type="family">Passone</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danilo</namePart>
<namePart type="family">Croce</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Basili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Grounded natural language understanding in Human-Robot Interaction (HRI) requires integrating linguistic, visual, and world knowledge to ensure effective task execution. We propose an approach that enhances Multi-Modal Large Language Models (MLLMs) with a novel explicit dialogue planning phase, allowing robotic agents to systematically refine their understanding of ambiguous commands through structured clarification steps. This reduces hallucinations and improves task feasibility.To evaluate this approach, we introduce a novel dataset of over 1,100 annotated dialogues in English and Italian, designed for fine-tuning and assessing Multi-Modal models in HRI scenarios. Experimental results show that dialogue planning improves response accuracy and quality, and contributes to cross-lingual generalisation, enabling models trained in one language to transfer effectively to another. To the best of our knowledge, this is the first application of structured, goal-driven, and explicit dialogue planning in Multi-Modal LLMs for grounded interaction.</abstract>
<identifier type="citekey">hromei-etal-2025-training</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.837</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.837/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>16266</start>
<end>16284</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Training Multi-Modal LLMs through Dialogue Planning for HRI
%A Hromei, Claudiu Daniel
%A Borazio, Federico
%A Sensi, Andrea
%A Passone, Elisa
%A Croce, Danilo
%A Basili, Roberto
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F hromei-etal-2025-training
%X Grounded natural language understanding in Human-Robot Interaction (HRI) requires integrating linguistic, visual, and world knowledge to ensure effective task execution. We propose an approach that enhances Multi-Modal Large Language Models (MLLMs) with a novel explicit dialogue planning phase, allowing robotic agents to systematically refine their understanding of ambiguous commands through structured clarification steps. This reduces hallucinations and improves task feasibility.To evaluate this approach, we introduce a novel dataset of over 1,100 annotated dialogues in English and Italian, designed for fine-tuning and assessing Multi-Modal models in HRI scenarios. Experimental results show that dialogue planning improves response accuracy and quality, and contributes to cross-lingual generalisation, enabling models trained in one language to transfer effectively to another. To the best of our knowledge, this is the first application of structured, goal-driven, and explicit dialogue planning in Multi-Modal LLMs for grounded interaction.
%R 10.18653/v1/2025.findings-acl.837
%U https://aclanthology.org/2025.findings-acl.837/
%U https://doi.org/10.18653/v1/2025.findings-acl.837
%P 16266-16284
Markdown (Informal)
[Training Multi-Modal LLMs through Dialogue Planning for HRI](https://aclanthology.org/2025.findings-acl.837/) (Hromei et al., Findings 2025)
ACL
- Claudiu Daniel Hromei, Federico Borazio, Andrea Sensi, Elisa Passone, Danilo Croce, and Roberto Basili. 2025. Training Multi-Modal LLMs through Dialogue Planning for HRI. In Findings of the Association for Computational Linguistics: ACL 2025, pages 16266–16284, Vienna, Austria. Association for Computational Linguistics.