@inproceedings{daniilidou-etal-2026-social,
title = "When social robots see our sketches: evaluating human perception of a robot and a {VLM} model performance in a drawing task",
author = "Daniilidou, Viktoria Paraskevi and
Ilinykh, Nikolai and
Maraev, Vladislav",
editor = "Riccardi, Giuseppe and
Mousavi, Seyed Mahed and
Torres, Maria Ines and
Yoshino, Koichiro and
Callejas, Zoraida and
Chowdhury, Shammur Absar and
Chen, Yun-Nung and
Bechet, Frederic and
Gustafson, Joakim and
Damnati, G{\'e}raldine and
Papangelis, Alex and
D{'}Haro, Luis Fernando and
Mendon{\c{c}}a, John and
Bernardi, Raffaella and
Hakkani-Tur, Dilek and
Di Fabbrizio, Giuseppe {``}Pino{''} and
Kawahara, Tatsuya and
Alam, Firoj and
Tur, Gokhan and
Johnston, Michael",
booktitle = "Proceedings of the 16th International Workshop on Spoken Dialogue System Technology",
month = feb,
year = "2026",
address = "Trento, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.iwsds-1.25/",
pages = "239--252",
abstract = "We introduce a multimodal framework for interactive drawing in a robot-assisted second language learning scenario. In this scenario, humans are asked to draw objects and spatial relations between them, while a social robot that uses a vision-language model ({VLM}) to analyse whether the drawings are correct. {T}he correctness decision that is passed to the human is coming from a {W}izard-of-{O}z ({W}o{Z}) setup. Therefore, we use it to indirectly evaluate the quality of {VLM} predictions. We show that the task is very challenging for a {VLM} and approaching evaluation of {VLM} performance is important: focusing on the correctness of prediction of certain features (objects, relations) provides a different evaluation picture from when the model is evaluated on prediction of the content of the image as a whole. We also examine how the appearance of the social agent and the type of feedback influence perception of the agent by the participants through a questionnaire. The comparison of verbal feedback, generated by the large language models, against simple pattern-based feedback did not show any significant effects whereas the robot{'}s appearance change indicated significant difference in user ratings concerning naturalness of the agent and its social presence."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="daniilidou-etal-2026-social">
<titleInfo>
<title>When social robots see our sketches: evaluating human perception of a robot and a VLM model performance in a drawing task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Viktoria</namePart>
<namePart type="given">Paraskevi</namePart>
<namePart type="family">Daniilidou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladislav</namePart>
<namePart type="family">Maraev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Workshop on Spoken Dialogue System Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Riccardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seyed</namePart>
<namePart type="given">Mahed</namePart>
<namePart type="family">Mousavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Ines</namePart>
<namePart type="family">Torres</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koichiro</namePart>
<namePart type="family">Yoshino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zoraida</namePart>
<namePart type="family">Callejas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="given">Absar</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Bechet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joakim</namePart>
<namePart type="family">Gustafson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Géraldine</namePart>
<namePart type="family">Damnati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Papangelis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="given">Fernando</namePart>
<namePart type="family">D’Haro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="family">Mendonça</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raffaella</namePart>
<namePart type="family">Bernardi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dilek</namePart>
<namePart type="family">Hakkani-Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="given">“Pino”</namePart>
<namePart type="family">Di Fabbrizio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Kawahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gokhan</namePart>
<namePart type="family">Tur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Johnston</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Trento, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce a multimodal framework for interactive drawing in a robot-assisted second language learning scenario. In this scenario, humans are asked to draw objects and spatial relations between them, while a social robot that uses a vision-language model (VLM) to analyse whether the drawings are correct. The correctness decision that is passed to the human is coming from a Wizard-of-Oz (WoZ) setup. Therefore, we use it to indirectly evaluate the quality of VLM predictions. We show that the task is very challenging for a VLM and approaching evaluation of VLM performance is important: focusing on the correctness of prediction of certain features (objects, relations) provides a different evaluation picture from when the model is evaluated on prediction of the content of the image as a whole. We also examine how the appearance of the social agent and the type of feedback influence perception of the agent by the participants through a questionnaire. The comparison of verbal feedback, generated by the large language models, against simple pattern-based feedback did not show any significant effects whereas the robot’s appearance change indicated significant difference in user ratings concerning naturalness of the agent and its social presence.</abstract>
<identifier type="citekey">daniilidou-etal-2026-social</identifier>
<location>
<url>https://aclanthology.org/2026.iwsds-1.25/</url>
</location>
<part>
<date>2026-02</date>
<extent unit="page">
<start>239</start>
<end>252</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When social robots see our sketches: evaluating human perception of a robot and a VLM model performance in a drawing task
%A Daniilidou, Viktoria Paraskevi
%A Ilinykh, Nikolai
%A Maraev, Vladislav
%Y Riccardi, Giuseppe
%Y Mousavi, Seyed Mahed
%Y Torres, Maria Ines
%Y Yoshino, Koichiro
%Y Callejas, Zoraida
%Y Chowdhury, Shammur Absar
%Y Chen, Yun-Nung
%Y Bechet, Frederic
%Y Gustafson, Joakim
%Y Damnati, Géraldine
%Y Papangelis, Alex
%Y D’Haro, Luis Fernando
%Y Mendonça, John
%Y Bernardi, Raffaella
%Y Hakkani-Tur, Dilek
%Y Di Fabbrizio, Giuseppe “Pino”
%Y Kawahara, Tatsuya
%Y Alam, Firoj
%Y Tur, Gokhan
%Y Johnston, Michael
%S Proceedings of the 16th International Workshop on Spoken Dialogue System Technology
%D 2026
%8 February
%I Association for Computational Linguistics
%C Trento, Italy
%F daniilidou-etal-2026-social
%X We introduce a multimodal framework for interactive drawing in a robot-assisted second language learning scenario. In this scenario, humans are asked to draw objects and spatial relations between them, while a social robot that uses a vision-language model (VLM) to analyse whether the drawings are correct. The correctness decision that is passed to the human is coming from a Wizard-of-Oz (WoZ) setup. Therefore, we use it to indirectly evaluate the quality of VLM predictions. We show that the task is very challenging for a VLM and approaching evaluation of VLM performance is important: focusing on the correctness of prediction of certain features (objects, relations) provides a different evaluation picture from when the model is evaluated on prediction of the content of the image as a whole. We also examine how the appearance of the social agent and the type of feedback influence perception of the agent by the participants through a questionnaire. The comparison of verbal feedback, generated by the large language models, against simple pattern-based feedback did not show any significant effects whereas the robot’s appearance change indicated significant difference in user ratings concerning naturalness of the agent and its social presence.
%U https://aclanthology.org/2026.iwsds-1.25/
%P 239-252
Markdown (Informal)
[When social robots see our sketches: evaluating human perception of a robot and a VLM model performance in a drawing task](https://aclanthology.org/2026.iwsds-1.25/) (Daniilidou et al., IWSDS 2026)
ACL