@inproceedings{rice-etal-2025-egodrive,
title = "{E}go{D}rive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria",
author = "Rice, Michael and
Krause, Lorenz and
Qureshi, Waqar Shahid",
editor = "Acarturk, Cengiz and
Nasir, Jamal and
Can, Burcu and
Coltekin, Cagr{\i}",
booktitle = "Proceedings of the First International Workshop on Gaze Data and Natural Language Processing",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, BULGARIA",
url = "https://aclanthology.org/2025.gaze4nlp-1.3/",
pages = "18--25",
abstract = "Egocentric sensing using wearable devices offers a unique first-person perspective for driver behaviour analysis and monitoring, with the potential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver{'}s viewpoint. In this paper, we introduce a multimodal driver behaviour recognition framework utilizing Meta{'}s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution Red Green Blue (RGB) video, gaze-tracking data, Inertial Measurement Unit (IMU) signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are temporally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: \textit{Driving}, \textit{Left Mirror Check}, \textit{Right Wing Mirror Check}, \textit{Rear-view Mirror Check}, \textit{Mobile Phone Usage}, and \textit{Idle}. We design a Transformer-based recognition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to capture cross-modal temporal dependencies. To investigate the trade-off between accuracy and efficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, designed for real-time performance. These models achieve classification accuracies of 98.6{\%} and 97.4{\%} respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a specialized GPU{---}highlighting its potential for efficient, real-time in-cabin driver monitoring."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="rice-etal-2025-egodrive">
<titleInfo>
<title>EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Rice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lorenz</namePart>
<namePart type="family">Krause</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waqar</namePart>
<namePart type="given">Shahid</namePart>
<namePart type="family">Qureshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First International Workshop on Gaze Data and Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cengiz</namePart>
<namePart type="family">Acarturk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jamal</namePart>
<namePart type="family">Nasir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Burcu</namePart>
<namePart type="family">Can</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cagrı</namePart>
<namePart type="family">Coltekin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, BULGARIA</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Egocentric sensing using wearable devices offers a unique first-person perspective for driver behaviour analysis and monitoring, with the potential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver’s viewpoint. In this paper, we introduce a multimodal driver behaviour recognition framework utilizing Meta’s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution Red Green Blue (RGB) video, gaze-tracking data, Inertial Measurement Unit (IMU) signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are temporally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: Driving, Left Mirror Check, Right Wing Mirror Check, Rear-view Mirror Check, Mobile Phone Usage, and Idle. We design a Transformer-based recognition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to capture cross-modal temporal dependencies. To investigate the trade-off between accuracy and efficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, designed for real-time performance. These models achieve classification accuracies of 98.6% and 97.4% respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a specialized GPU—highlighting its potential for efficient, real-time in-cabin driver monitoring.</abstract>
<identifier type="citekey">rice-etal-2025-egodrive</identifier>
<location>
<url>https://aclanthology.org/2025.gaze4nlp-1.3/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>18</start>
<end>25</end>
</extent>
</part>
</mods>
</modsCollection>

%0 Conference Proceedings
%T EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria
%A Rice, Michael
%A Krause, Lorenz
%A Qureshi, Waqar Shahid
%Y Acarturk, Cengiz
%Y Nasir, Jamal
%Y Can, Burcu
%Y Coltekin, Cagrı
%S Proceedings of the First International Workshop on Gaze Data and Natural Language Processing
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, BULGARIA
%C Varna, Bulgaria
%F rice-etal-2025-egodrive
%X Egocentric sensing using wearable devices offers a unique first-person perspective for driver behaviour analysis and monitoring, with the potential to accurately capture rich multimodal cues such as eye gaze, head motion, and hand activity directly from the driver’s viewpoint. In this paper, we introduce a multimodal driver behaviour recognition framework utilizing Meta’s Project Aria smart glasses, along with a novel, synchronized egocentric driving dataset comprising high-resolution Red Green Blue (RGB) video, gaze-tracking data, Inertial Measurement Unit (IMU) signals, hand pose landmarks, and YOLO-based semantic object detections. All sensor data streams are temporally aligned and segmented into fixed-length clips, each manually annotated with one of six distinct driver behavior classes: Driving, Left Mirror Check, Right Wing Mirror Check, Rear-view Mirror Check, Mobile Phone Usage, and Idle. We design a Transformer-based recognition framework in which each modality is processed by a specialized encoder and then fused via Temporal Transformer layers to capture cross-modal temporal dependencies. To investigate the trade-off between accuracy and efficiency for real-time deployment, we introduce two model variants: EgoDriveMax, optimized for maximum accuracy, and EgoDriveRT, designed for real-time performance. These models achieve classification accuracies of 98.6% and 97.4% respectively. Notably, EgoDriveRT delivers strong performance despite operating with only 104K parameters and requiring just 2.65 ms per inference without the use of a specialized GPU—highlighting its potential for efficient, real-time in-cabin driver monitoring.
%U https://aclanthology.org/2025.gaze4nlp-1.3/
%P 18-25

Markdown (Informal)
[EgoDrive: Egocentric Multimodal Driver Behavior Recognition Using Project Aria](https://aclanthology.org/2025.gaze4nlp-1.3/) (Rice et al., Gaze4NLP 2025)
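
The abstract describes per-modality encoders whose outputs are fused by Temporal Transformer layers and classified into six driver-behaviour classes. Below is a minimal, illustrative PyTorch sketch of that kind of design; the module names, feature dimensions, sum-based fusion, and hyperparameters are assumptions made for illustration and are not the published EgoDrive implementation.

```python
# Hypothetical sketch (not the authors' code): one encoder per modality,
# Temporal Transformer layers for cross-modal temporal context, and a
# linear head over the six behaviour classes named in the abstract.
import torch
import torch.nn as nn

CLASSES = ["Driving", "Left Mirror Check", "Right Wing Mirror Check",
           "Rear-view Mirror Check", "Mobile Phone Usage", "Idle"]

class ModalityEncoder(nn.Module):
    """Projects one modality's per-frame features into a shared embedding space."""
    def __init__(self, in_dim: int, d_model: int):
        super().__init__()
        self.proj = nn.Sequential(nn.Linear(in_dim, d_model), nn.GELU(), nn.LayerNorm(d_model))

    def forward(self, x):              # x: (batch, time, in_dim)
        return self.proj(x)            # (batch, time, d_model)

class EgoDriveSketch(nn.Module):
    def __init__(self, modality_dims: dict, d_model: int = 64,
                 n_layers: int = 2, n_heads: int = 4, n_classes: int = len(CLASSES)):
        super().__init__()
        self.encoders = nn.ModuleDict(
            {name: ModalityEncoder(dim, d_model) for name, dim in modality_dims.items()})
        layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward=4 * d_model,
                                           batch_first=True)
        self.temporal = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.head = nn.Linear(d_model, n_classes)

    def forward(self, inputs: dict):
        # Sum the modality embeddings per time step (one simple fusion choice),
        # then let the Temporal Transformer model cross-modal temporal dependencies.
        fused = sum(self.encoders[name](x) for name, x in inputs.items())
        ctx = self.temporal(fused)               # (batch, time, d_model)
        return self.head(ctx.mean(dim=1))        # mean-pool over time -> class logits

# Example with made-up per-modality feature sizes and a 60-step clip.
dims = {"rgb": 256, "gaze": 3, "imu": 6, "hands": 42, "objects": 16}
model = EgoDriveSketch(dims)
batch = {k: torch.randn(2, 60, d) for k, d in dims.items()}
logits = model(batch)                            # shape: (2, 6)
print(sum(p.numel() for p in model.parameters()), "parameters")
```

A compact configuration like this lands in the low hundreds of thousands of parameters, the same order of magnitude as the 104K-parameter EgoDriveRT variant mentioned in the abstract, though the published models will differ in architecture and size.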