@inproceedings{hromei-etal-2025-grounded,
title = "Grounded Semantic Role Labelling from Synthetic Multimodal Data for Situated Robot Commands",
author = "Hromei, Claudiu Daniel and
Scaiella, Antonio and
Croce, Danilo and
Basili, Roberto",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1212/",
doi = "10.18653/v1/2025.emnlp-main.1212",
pages = "23747--23770",
ISBN = "979-8-89176-332-6",
abstract = "Understanding natural language commands in situated Human-Robot Interaction (HRI) requires linking linguistic input to perceptual context. Traditional symbolic parsers lack the flexibility to operate in complex, dynamic environments. We introduce a novel Multimodal Grounded Semantic Role Labelling (G-SRL) framework that combines frame semantics with perceptual grounding, enabling robots to interpret commands via multimodal logical forms. Our approach leverages modern Visual Language Models (VLLMs), which jointly process text and images, and is supported by an automated pipeline that generates high-quality training data. Structured command annotations are converted into photorealistic scenes via LLM-guided prompt engineering and diffusion models, then rigorously validated through object detection and visual question answering. The pipeline produces over 11,000 image-command pairs (3,500+ manually validated), while approaching the quality of manually curated datasets at significantly lower cost."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hromei-etal-2025-grounded">
<titleInfo>
<title>Grounded Semantic Role Labelling from Synthetic Multimodal Data for Situated Robot Commands</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claudiu</namePart>
<namePart type="given">Daniel</namePart>
<namePart type="family">Hromei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Scaiella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Danilo</namePart>
<namePart type="family">Croce</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Basili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Understanding natural language commands in situated Human-Robot Interaction (HRI) requires linking linguistic input to perceptual context. Traditional symbolic parsers lack the flexibility to operate in complex, dynamic environments. We introduce a novel Multimodal Grounded Semantic Role Labelling (G-SRL) framework that combines frame semantics with perceptual grounding, enabling robots to interpret commands via multimodal logical forms. Our approach leverages modern Visual Language Models (VLLMs), which jointly process text and images, and is supported by an automated pipeline that generates high-quality training data. Structured command annotations are converted into photorealistic scenes via LLM-guided prompt engineering and diffusion models, then rigorously validated through object detection and visual question answering. The pipeline produces over 11,000 image-command pairs (3,500+ manually validated), while approaching the quality of manually curated datasets at significantly lower cost.</abstract>
<identifier type="citekey">hromei-etal-2025-grounded</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.1212</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1212/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>23747</start>
<end>23770</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Grounded Semantic Role Labelling from Synthetic Multimodal Data for Situated Robot Commands
%A Hromei, Claudiu Daniel
%A Scaiella, Antonio
%A Croce, Danilo
%A Basili, Roberto
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F hromei-etal-2025-grounded
%X Understanding natural language commands in situated Human-Robot Interaction (HRI) requires linking linguistic input to perceptual context. Traditional symbolic parsers lack the flexibility to operate in complex, dynamic environments. We introduce a novel Multimodal Grounded Semantic Role Labelling (G-SRL) framework that combines frame semantics with perceptual grounding, enabling robots to interpret commands via multimodal logical forms. Our approach leverages modern Visual Language Models (VLLMs), which jointly process text and images, and is supported by an automated pipeline that generates high-quality training data. Structured command annotations are converted into photorealistic scenes via LLM-guided prompt engineering and diffusion models, then rigorously validated through object detection and visual question answering. The pipeline produces over 11,000 image-command pairs (3,500+ manually validated), while approaching the quality of manually curated datasets at significantly lower cost.
%R 10.18653/v1/2025.emnlp-main.1212
%U https://aclanthology.org/2025.emnlp-main.1212/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1212
%P 23747-23770
Markdown (Informal)
[Grounded Semantic Role Labelling from Synthetic Multimodal Data for Situated Robot Commands](https://aclanthology.org/2025.emnlp-main.1212/) (Hromei et al., EMNLP 2025)
ACL