BibTeX
@inproceedings{dehghani-etal-2025-emo3d,
title = "{E}mo3{D}: Metric and Benchmarking Dataset for 3{D} Facial Expression Generation from Emotion Description",
author = "Dehghani, Mahshid and
Shafiee, Amirahmad and
Shafiei, Ali and
Fallah, Neda and
Alizadeh, Farahmand and
Gholinejad, Mohammad Mehdi and
Behroozi, Hamid and
Habibi, Jafar and
Asgari, Ehsaneddin",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.173/",
doi = "10.18653/v1/2025.findings-naacl.173",
pages = "3158--3172",
ISBN = "979-8-89176-195-7",
abstract = "3D facial emotion modeling has important applications in areas such as animation design, virtual reality, and emotional human-computer interaction (HCI). However, existing models are constrained by limited emotion classes and insufficient datasets. To address this, we introduce Emo3D, an extensive ``Text-Image-Expression dataset'' that spans a wide spectrum of human emotions, each paired with images and 3D blendshapes. Leveraging Large Language Models (LLMs), we generate a diverse array of textual descriptions, enabling the capture of a broad range of emotional expressions. Using this unique dataset, we perform a comprehensive evaluation of fine-tuned language-based models and vision-language models, such as Contrastive Language-Image Pretraining (CLIP), for 3D facial expression synthesis. To better assess conveyed emotions, we introduce Emo3D metric, a new evaluation metric that aligns more closely with human perception than traditional Mean Squared Error (MSE). Unlike MSE, which focuses on numerical differences, Emo3D captures emotional nuances in visual-text alignment and semantic richness. Emo3D dataset and metric hold great potential for advancing applications in animation and virtual reality."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="dehghani-etal-2025-emo3d">
    <titleInfo>
      <title>Emo3D: Metric and Benchmarking Dataset for 3D Facial Expression Generation from Emotion Description</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Mahshid</namePart>
      <namePart type="family">Dehghani</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Amirahmad</namePart>
      <namePart type="family">Shafiee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ali</namePart>
      <namePart type="family">Shafiei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Neda</namePart>
      <namePart type="family">Fallah</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Farahmand</namePart>
      <namePart type="family">Alizadeh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="given">Mehdi</namePart>
      <namePart type="family">Gholinejad</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hamid</namePart>
      <namePart type="family">Behroozi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jafar</namePart>
      <namePart type="family">Habibi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ehsaneddin</namePart>
      <namePart type="family">Asgari</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-195-7</identifier>
    </relatedItem>
    <abstract>3D facial emotion modeling has important applications in areas such as animation design, virtual reality, and emotional human-computer interaction (HCI). However, existing models are constrained by limited emotion classes and insufficient datasets. To address this, we introduce Emo3D, an extensive “Text-Image-Expression dataset” that spans a wide spectrum of human emotions, each paired with images and 3D blendshapes. Leveraging Large Language Models (LLMs), we generate a diverse array of textual descriptions, enabling the capture of a broad range of emotional expressions. Using this unique dataset, we perform a comprehensive evaluation of fine-tuned language-based models and vision-language models, such as Contrastive Language-Image Pretraining (CLIP), for 3D facial expression synthesis. To better assess conveyed emotions, we introduce the Emo3D metric, a new evaluation metric that aligns more closely with human perception than traditional Mean Squared Error (MSE). Unlike MSE, which focuses on numerical differences, Emo3D captures emotional nuances in visual-text alignment and semantic richness. The Emo3D dataset and metric hold great potential for advancing applications in animation and virtual reality.</abstract>
    <identifier type="citekey">dehghani-etal-2025-emo3d</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-naacl.173</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-naacl.173/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>3158</start>
        <end>3172</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Emo3D: Metric and Benchmarking Dataset for 3D Facial Expression Generation from Emotion Description
%A Dehghani, Mahshid
%A Shafiee, Amirahmad
%A Shafiei, Ali
%A Fallah, Neda
%A Alizadeh, Farahmand
%A Gholinejad, Mohammad Mehdi
%A Behroozi, Hamid
%A Habibi, Jafar
%A Asgari, Ehsaneddin
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F dehghani-etal-2025-emo3d
%X 3D facial emotion modeling has important applications in areas such as animation design, virtual reality, and emotional human-computer interaction (HCI). However, existing models are constrained by limited emotion classes and insufficient datasets. To address this, we introduce Emo3D, an extensive “Text-Image-Expression dataset” that spans a wide spectrum of human emotions, each paired with images and 3D blendshapes. Leveraging Large Language Models (LLMs), we generate a diverse array of textual descriptions, enabling the capture of a broad range of emotional expressions. Using this unique dataset, we perform a comprehensive evaluation of fine-tuned language-based models and vision-language models, such as Contrastive Language-Image Pretraining (CLIP), for 3D facial expression synthesis. To better assess conveyed emotions, we introduce the Emo3D metric, a new evaluation metric that aligns more closely with human perception than traditional Mean Squared Error (MSE). Unlike MSE, which focuses on numerical differences, Emo3D captures emotional nuances in visual-text alignment and semantic richness. The Emo3D dataset and metric hold great potential for advancing applications in animation and virtual reality.
%R 10.18653/v1/2025.findings-naacl.173
%U https://aclanthology.org/2025.findings-naacl.173/
%U https://doi.org/10.18653/v1/2025.findings-naacl.173
%P 3158-3172
Markdown (Informal)
[Emo3D: Metric and Benchmarking Dataset for 3D Facial Expression Generation from Emotion Description](https://aclanthology.org/2025.findings-naacl.173/) (Dehghani et al., Findings 2025)
ACL
Mahshid Dehghani, Amirahmad Shafiee, Ali Shafiei, Neda Fallah, Farahmand Alizadeh, Mohammad Mehdi Gholinejad, Hamid Behroozi, Jafar Habibi, and Ehsaneddin Asgari. 2025. Emo3D: Metric and Benchmarking Dataset for 3D Facial Expression Generation from Emotion Description. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 3158–3172, Albuquerque, New Mexico. Association for Computational Linguistics.