@inproceedings{lukin-etal-2026-non,
title = "Non-Event Oriented Video Assessments in Long-Form Robot Videos",
author = "Lukin, Stephanie M. and
Pollard, Kimberly A. and
Bonial, Claire and
Hayes, Cory J. and
Artstein, Ron and
Georgila, Kallirroi and
Traum, David",
editor = "Murray, Kenton and
Kriz, Reno",
booktitle = "Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval ({MAGM}a{R} 2026)",
month = jul,
year = "2026",
address = "San Diego, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.magmar-main.8/",
pages = "27--41",
ISBN = "979-8-89176-425-5",
abstract = "We introduce Video-SCOUT, a novel dataset of sixty 20-minute robot-recorded videos from human-robot collaborative exploration exercises, together with a new video analysis method for these types of exploration videos. Unlike video from stationary cameras where detection of motion can help identify events of interest, the camera in an exploration task is constantly in motion while the environment is stationary. Our analysis method{---}Non-Event Oriented Video Assessments (NOVA){---}uses vision-language models to select frames relevant for supporting a particular assessment within continuous long-form videos. Results of testing with two different video-language models reveals a trade-off in precision and recall, and exhibits gains in overall recall when combined with a human{'}s knowledge, suggesting that NOVA may improve a human analysis of robot-navigation. We outline future work to mitigate miscommunication in human-robot interaction by leveraging dialogue with NOVA in support of better collaboration."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lukin-etal-2026-non">
<titleInfo>
<title>Non-Event Oriented Video Assessments in Long-Form Robot Videos</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stephanie</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Lukin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kimberly</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Pollard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cory</namePart>
<namePart type="given">J</namePart>
<namePart type="family">Hayes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ron</namePart>
<namePart type="family">Artstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kallirroi</namePart>
<namePart type="family">Georgila</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Traum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kenton</namePart>
<namePart type="family">Murray</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reno</namePart>
<namePart type="family">Kriz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-425-5</identifier>
</relatedItem>
<abstract>We introduce Video-SCOUT, a novel dataset of sixty 20-minute robot-recorded videos from human-robot collaborative exploration exercises, together with a new video analysis method for these types of exploration videos. Unlike video from stationary cameras where detection of motion can help identify events of interest, the camera in an exploration task is constantly in motion while the environment is stationary. Our analysis method—Non-Event Oriented Video Assessments (NOVA)—uses vision-language models to select frames relevant for supporting a particular assessment within continuous long-form videos. Results of testing with two different video-language models reveals a trade-off in precision and recall, and exhibits gains in overall recall when combined with a human’s knowledge, suggesting that NOVA may improve a human analysis of robot-navigation. We outline future work to mitigate miscommunication in human-robot interaction by leveraging dialogue with NOVA in support of better collaboration.</abstract>
<identifier type="citekey">lukin-etal-2026-non</identifier>
<location>
<url>https://aclanthology.org/2026.magmar-main.8/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>27</start>
<end>41</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Non-Event Oriented Video Assessments in Long-Form Robot Videos
%A Lukin, Stephanie M.
%A Pollard, Kimberly A.
%A Bonial, Claire
%A Hayes, Cory J.
%A Artstein, Ron
%A Georgila, Kallirroi
%A Traum, David
%Y Murray, Kenton
%Y Kriz, Reno
%S Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, USA
%@ 979-8-89176-425-5
%F lukin-etal-2026-non
%X We introduce Video-SCOUT, a novel dataset of sixty 20-minute robot-recorded videos from human-robot collaborative exploration exercises, together with a new video analysis method for these types of exploration videos. Unlike video from stationary cameras where detection of motion can help identify events of interest, the camera in an exploration task is constantly in motion while the environment is stationary. Our analysis method—Non-Event Oriented Video Assessments (NOVA)—uses vision-language models to select frames relevant for supporting a particular assessment within continuous long-form videos. Results of testing with two different video-language models reveals a trade-off in precision and recall, and exhibits gains in overall recall when combined with a human’s knowledge, suggesting that NOVA may improve a human analysis of robot-navigation. We outline future work to mitigate miscommunication in human-robot interaction by leveraging dialogue with NOVA in support of better collaboration.
%U https://aclanthology.org/2026.magmar-main.8/
%P 27-41
Markdown (Informal)
[Non-Event Oriented Video Assessments in Long-Form Robot Videos](https://aclanthology.org/2026.magmar-main.8/) (Lukin et al., MAGMaR 2026)
ACL
- Stephanie M. Lukin, Kimberly A. Pollard, Claire Bonial, Cory J. Hayes, Ron Artstein, Kallirroi Georgila, and David Traum. 2026. Non-Event Oriented Video Assessments in Long-Form Robot Videos. In Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2026), pages 27–41, San Diego, USA. Association for Computational Linguistics.