@inproceedings{rice-etal-2025-egodrive,
title = "{E}go{D}rive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria",
author = "Rice, Michael and
Krause, Lorenz and
Qureshi, Waqar Shahid",
editor = "Acarturk, Cengiz and
Nasir, Jamal and
Can, Burcu and
Coltekin, Cagr{\i}",
booktitle = "Proceedings of the First International Workshop on Gaze Data and Natural Language Processing",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, BULGARIA",
url = "https://aclanthology.org/2025.gaze4nlp-1.3/",
pages = "18--25",
abstract = "Egocentric sensing using wearable devices of- fers a unique first-person perspective for driver behavior analysis and monitoring, with the po- tential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver{'}s view- point. In this paper, we introduce a multimodal driver behavior recognition framework utilizing Meta{'}s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution RGB video, gaze- tracking data, inertial IMU signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are tempo- rally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: Driving, Left Mirror Check, Right Wing Mirror Check, Rear- view Mirror Check, Mobile Phone Usage, and Idle. We design a Transformer-based recog- nition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to cap- ture cross-modal temporal dependencies. To in- vestigate the trade-off between accuracy and ef- ficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, de- signed for real-time performance. These mod- els achieve classification accuracies of 98.6{\%} and 97.4{\%} respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a spe- cialized GPU{---}highlighting its potential for efficient, real-time in-cabin driver monitoring."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rice-etal-2025-egodrive">
<titleInfo>
<title>EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Rice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lorenz</namePart>
<namePart type="family">Krause</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waqar</namePart>
<namePart type="given">Shahid</namePart>
<namePart type="family">Qureshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First International Workshop on Gaze Data and Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cengiz</namePart>
<namePart type="family">Acarturk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jamal</namePart>
<namePart type="family">Nasir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Burcu</namePart>
<namePart type="family">Can</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cagrı</namePart>
<namePart type="family">Coltekin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, BULGARIA</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Egocentric sensing using wearable devices offers a unique first-person perspective for driver behavior analysis and monitoring, with the potential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver’s viewpoint. In this paper, we introduce a multimodal driver behavior recognition framework utilizing Meta’s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution RGB video, gaze-tracking data, inertial IMU signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are temporally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: Driving, Left Mirror Check, Right Wing Mirror Check, Rear-view Mirror Check, Mobile Phone Usage, and Idle. We design a Transformer-based recognition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to capture cross-modal temporal dependencies. To investigate the trade-off between accuracy and efficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, designed for real-time performance. These models achieve classification accuracies of 98.6% and 97.4% respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a specialized GPU—highlighting its potential for efficient, real-time in-cabin driver monitoring.</abstract>
<identifier type="citekey">rice-etal-2025-egodrive</identifier>
<location>
<url>https://aclanthology.org/2025.gaze4nlp-1.3/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>18</start>
<end>25</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria
%A Rice, Michael
%A Krause, Lorenz
%A Qureshi, Waqar Shahid
%Y Acarturk, Cengiz
%Y Nasir, Jamal
%Y Can, Burcu
%Y Coltekin, Cagrı
%S Proceedings of the First International Workshop on Gaze Data and Natural Language Processing
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, BULGARIA
%C Varna, Bulgaria
%F rice-etal-2025-egodrive
%X Egocentric sensing using wearable devices offers a unique first-person perspective for driver behavior analysis and monitoring, with the potential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver’s viewpoint. In this paper, we introduce a multimodal driver behavior recognition framework utilizing Meta’s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution RGB video, gaze-tracking data, inertial IMU signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are temporally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: Driving, Left Mirror Check, Right Wing Mirror Check, Rear-view Mirror Check, Mobile Phone Usage, and Idle. We design a Transformer-based recognition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to capture cross-modal temporal dependencies. To investigate the trade-off between accuracy and efficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, designed for real-time performance. These models achieve classification accuracies of 98.6% and 97.4% respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a specialized GPU—highlighting its potential for efficient, real-time in-cabin driver monitoring.
%U https://aclanthology.org/2025.gaze4nlp-1.3/
%P 18-25
Markdown (Informal)
[EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria](https://aclanthology.org/2025.gaze4nlp-1.3/) (Rice et al., Gaze4NLP 2025)
ACL
Michael Rice, Lorenz Krause, and Waqar Shahid Qureshi. 2025. EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria. In Proceedings of the First International Workshop on Gaze Data and Natural Language Processing, pages 18–25, Varna, Bulgaria. INCOMA Ltd., Shoumen, BULGARIA.