@inproceedings{lee-etal-2025-multimodal-approaches,
title = "Multimodal Approaches for Stress Recognition: A Comparative Study Using the {S}tress{ID} Dataset",
author = "Lee, Chia-Yun and
Pleva, Mat{\'u}{\v{s}} and
Hladek, Daniel and
Su, Ming-Hsiang",
editor = "Chang, Kai-Wei and
Lu, Ke-Han and
Yang, Chih-Kai and
Tam, Zhi-Rui and
Chang, Wen-Yu and
Wang, Chung-Che",
booktitle = "Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)",
month = nov,
year = "2025",
address = "National Taiwan University, Taipei City, Taiwan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.rocling-main.4/",
pages = "29--34",
ISBN = "979-8-89176-379-1",
abstract = "Mental health concerns have garnered increasing attention, highlighting the importance of timely and accurate identification of individual stress states as a critical research domain. This study employs the multimodal StressID dataset to evaluate the contributions of three modalities{---}physiological signals, video, and audio{---}in stress recognition tasks. A set of machine learning models, including Random Forests (RF), Support Vector Machines (SVM), Multi-Layer Perceptrons (MLP), and K-Nearest Neighbors (KNN), were trained and tested with optimized parameters for each modality. In addition, the effectiveness of different multimodal fusion strategies was systematically examined. The unimodal experiments revealed that the physiological modality achieved the highest performance in the binary stress classification task (F1-score = 0.751), whereas the audio modality outperformed the others in the three-class classification task (F1-score = 0.625). In the multimodal setting, feature-level fusion yielded stable improvements in the binary classification task, while decision-level fusion achieved superior performance in the three-class classification task (F1-score = 0.65). These findings demonstrate that multimodal integration can substantially enhance the accuracy of stress recognition. Future research directions include incorporating temporal modeling and addressing data imbalance to further improve the robustness and applicability of stress recognition systems."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2025-multimodal-approaches">
<titleInfo>
<title>Multimodal Approaches for Stress Recognition: A Comparative Study Using the StressID Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chia-Yun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matúš</namePart>
<namePart type="family">Pleva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Hladek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming-Hsiang</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ke-Han</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chih-Kai</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhi-Rui</namePart>
<namePart type="family">Tam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wen-Yu</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chung-Che</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">National Taiwan University, Taipei City, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-379-1</identifier>
</relatedItem>
<abstract>Mental health concerns have garnered increasing attention, highlighting the importance of timely and accurate identification of individual stress states as a critical research domain. This study employs the multimodal StressID dataset to evaluate the contributions of three modalities—physiological signals, video, and audio—in stress recognition tasks. A set of machine learning models, including Random Forests (RF), Support Vector Machines (SVM), Multi-Layer Perceptrons (MLP), and K-Nearest Neighbors (KNN), were trained and tested with optimized parameters for each modality. In addition, the effectiveness of different multimodal fusion strategies was systematically examined. The unimodal experiments revealed that the physiological modality achieved the highest performance in the binary stress classification task (F1-score = 0.751), whereas the audio modality outperformed the others in the three-class classification task (F1-score = 0.625). In the multimodal setting, feature-level fusion yielded stable improvements in the binary classification task, while decision-level fusion achieved superior performance in the three-class classification task (F1-score = 0.65). These findings demonstrate that multimodal integration can substantially enhance the accuracy of stress recognition. Future research directions include incorporating temporal modeling and addressing data imbalance to further improve the robustness and applicability of stress recognition systems.</abstract>
<identifier type="citekey">lee-etal-2025-multimodal-approaches</identifier>
<location>
<url>https://aclanthology.org/2025.rocling-main.4/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>29</start>
<end>34</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Approaches for Stress Recognition: A Comparative Study Using the StressID Dataset
%A Lee, Chia-Yun
%A Pleva, Matúš
%A Hladek, Daniel
%A Su, Ming-Hsiang
%Y Chang, Kai-Wei
%Y Lu, Ke-Han
%Y Yang, Chih-Kai
%Y Tam, Zhi-Rui
%Y Chang, Wen-Yu
%Y Wang, Chung-Che
%S Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C National Taiwan University, Taipei City, Taiwan
%@ 979-8-89176-379-1
%F lee-etal-2025-multimodal-approaches
%X Mental health concerns have garnered increasing attention, highlighting the importance of timely and accurate identification of individual stress states as a critical research domain. This study employs the multimodal StressID dataset to evaluate the contributions of three modalities—physiological signals, video, and audio—in stress recognition tasks. A set of machine learning models, including Random Forests (RF), Support Vector Machines (SVM), Multi-Layer Perceptrons (MLP), and K-Nearest Neighbors (KNN), was trained and tested with optimized parameters for each modality. In addition, the effectiveness of different multimodal fusion strategies was systematically examined. The unimodal experiments revealed that the physiological modality achieved the highest performance in the binary stress classification task (F1-score = 0.751), whereas the audio modality outperformed the others in the three-class classification task (F1-score = 0.625). In the multimodal setting, feature-level fusion yielded stable improvements in the binary classification task, while decision-level fusion achieved superior performance in the three-class classification task (F1-score = 0.65). These findings demonstrate that multimodal integration can substantially enhance the accuracy of stress recognition. Future research directions include incorporating temporal modeling and addressing data imbalance to further improve the robustness and applicability of stress recognition systems.
%U https://aclanthology.org/2025.rocling-main.4/
%P 29-34
Markdown (Informal)
[Multimodal Approaches for Stress Recognition: A Comparative Study Using the StressID Dataset](https://aclanthology.org/2025.rocling-main.4/) (Lee et al., ROCLING 2025)
ACL
Chia-Yun Lee, Matúš Pleva, Daniel Hladek, and Ming-Hsiang Su. 2025. Multimodal Approaches for Stress Recognition: A Comparative Study Using the StressID Dataset. In Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025), pages 29–34, National Taiwan University, Taipei City, Taiwan. Association for Computational Linguistics.
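
The abstract above contrasts feature-level (early) fusion with decision-level (late) fusion of modalities. As an informal illustration only, the following Python/scikit-learn sketch shows the difference between the two strategies on synthetic stand-ins for the three StressID modalities; the feature matrices, the Random Forest settings, and the probability-averaging rule are hypothetical and do not reproduce the paper's actual pipeline or results.

```python
# Minimal sketch of feature-level vs. decision-level fusion, assuming
# synthetic stand-ins for the StressID physiological/video/audio features.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
n = 300

# Hypothetical per-modality feature matrices and binary stress labels.
X_phys = rng.normal(size=(n, 20))    # physiological features
X_video = rng.normal(size=(n, 40))   # video features
X_audio = rng.normal(size=(n, 30))   # audio features
y = rng.integers(0, 2, size=n)

train_idx, test_idx = train_test_split(np.arange(n), test_size=0.3,
                                       random_state=0)

# Feature-level (early) fusion: concatenate all modality features and
# train a single classifier on the joint representation.
X_all = np.hstack([X_phys, X_video, X_audio])
early = RandomForestClassifier(random_state=0).fit(X_all[train_idx],
                                                   y[train_idx])
early_pred = early.predict(X_all[test_idx])

# Decision-level (late) fusion: train one classifier per modality and
# combine their predicted probabilities (simple averaging here).
probs = []
for X in (X_phys, X_video, X_audio):
    clf = RandomForestClassifier(random_state=0).fit(X[train_idx],
                                                     y[train_idx])
    probs.append(clf.predict_proba(X[test_idx])[:, 1])
late_pred = (np.mean(probs, axis=0) >= 0.5).astype(int)

print("feature-level fusion F1:", f1_score(y[test_idx], early_pred))
print("decision-level fusion F1:", f1_score(y[test_idx], late_pred))
```

Averaging predicted probabilities is just one possible late-fusion rule; the paper evaluates its own fusion variants and per-modality parameter choices, which this sketch does not replicate.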