@inproceedings{deshpande-etal-2022-fine,
    title     = {Fine-tuning of Convolutional Neural Networks for the Recognition of Facial Expressions in Sign Language Video Samples},
    author    = {Deshpande, Neha and
                 Nunnari, Fabrizio and
                 Avramidis, Eleftherios},
    editor    = {Efthimiou, Eleni and
                 Fotinea, Stavroula-Evita and
                 Hanke, Thomas and
                 McDonald, John C. and
                 Shterionov, Dimitar and
                 Wolfe, Rosalee},
    booktitle = {Proceedings of the 7th International Workshop on Sign Language Translation and Avatar Technology: The Junction of the Visual and the Textual: Challenges and Perspectives},
    month     = jun,
    year      = {2022},
    address   = {Marseille, France},
    publisher = {European Language Resources Association},
    url       = {https://aclanthology.org/2022.sltat-1.5},
    pages     = {29--38},
    abstract  = {In this paper, we investigate the capability of convolutional neural networks to recognize in sign language video frames the six basic Ekman facial expressions for {`}fear{'}, {`}disgust{'}, {`}surprise{'}, {`}sadness{'}, {`}happiness{'}, {`}anger{'} along with the {`}neutral{'} class. Given the limited amount of annotated facial expression data for the sign language domain, we started from a model pre-trained on general-purpose facial expression datasets and we applied various machine learning techniques such as fine-tuning, data augmentation, class balancing, as well as image preprocessing to reach a better accuracy. The models were evaluated using K-fold cross-validation to get more accurate conclusions. It is experimentally demonstrated that fine-tuning a pre-trained model along with data augmentation by horizontally flipping images and image normalization, helps in providing the best accuracy on the sign language dataset. The best setting achieves satisfactory classification accuracy, comparable to state-of-the-art systems in generic facial expression recognition. Experiments were performed using different combinations of the above-mentioned techniques based on two different architectures, namely MobileNet and EfficientNet, and is deemed that both architectures seem equally suitable for the purpose of fine-tuning, whereas class balancing is discouraged.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deshpande-etal-2022-fine">
<titleInfo>
<title>Fine-tuning of Convolutional Neural Networks for the Recognition of Facial Expressions in Sign Language Video Samples</title>
</titleInfo>
<name type="personal">
<namePart type="given">Neha</namePart>
<namePart type="family">Deshpande</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabrizio</namePart>
<namePart type="family">Nunnari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eleftherios</namePart>
<namePart type="family">Avramidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th International Workshop on Sign Language Translation and Avatar Technology: The Junction of the Visual and the Textual: Challenges and Perspectives</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eleni</namePart>
<namePart type="family">Efthimiou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stavroula-Evita</namePart>
<namePart type="family">Fotinea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Hanke</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">John</namePart>
<namePart type="given">C</namePart>
<namePart type="family">McDonald</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dimitar</namePart>
<namePart type="family">Shterionov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rosalee</namePart>
<namePart type="family">Wolfe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we investigate the capability of convolutional neural networks to recognize in sign language video frames the six basic Ekman facial expressions for ‘fear’, ‘disgust’, ‘surprise’, ‘sadness’, ‘happiness’, ‘anger’ along with the ‘neutral’ class. Given the limited amount of annotated facial expression data for the sign language domain, we started from a model pre-trained on general-purpose facial expression datasets and we applied various machine learning techniques such as fine-tuning, data augmentation, class balancing, as well as image preprocessing to reach a better accuracy. The models were evaluated using K-fold cross-validation to get more accurate conclusions. It is experimentally demonstrated that fine-tuning a pre-trained model along with data augmentation by horizontally flipping images and image normalization, helps in providing the best accuracy on the sign language dataset. The best setting achieves satisfactory classification accuracy, comparable to state-of-the-art systems in generic facial expression recognition. Experiments were performed using different combinations of the above-mentioned techniques based on two different architectures, namely MobileNet and EfficientNet, and is deemed that both architectures seem equally suitable for the purpose of fine-tuning, whereas class balancing is discouraged.</abstract>
<identifier type="citekey">deshpande-etal-2022-fine</identifier>
<location>
<url>https://aclanthology.org/2022.sltat-1.5</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>29</start>
<end>38</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fine-tuning of Convolutional Neural Networks for the Recognition of Facial Expressions in Sign Language Video Samples
%A Deshpande, Neha
%A Nunnari, Fabrizio
%A Avramidis, Eleftherios
%Y Efthimiou, Eleni
%Y Fotinea, Stavroula-Evita
%Y Hanke, Thomas
%Y McDonald, John C.
%Y Shterionov, Dimitar
%Y Wolfe, Rosalee
%S Proceedings of the 7th International Workshop on Sign Language Translation and Avatar Technology: The Junction of the Visual and the Textual: Challenges and Perspectives
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F deshpande-etal-2022-fine
%X In this paper, we investigate the capability of convolutional neural networks to recognize in sign language video frames the six basic Ekman facial expressions for ‘fear’, ‘disgust’, ‘surprise’, ‘sadness’, ‘happiness’, ‘anger’ along with the ‘neutral’ class. Given the limited amount of annotated facial expression data for the sign language domain, we started from a model pre-trained on general-purpose facial expression datasets and we applied various machine learning techniques such as fine-tuning, data augmentation, class balancing, as well as image preprocessing to reach a better accuracy. The models were evaluated using K-fold cross-validation to get more accurate conclusions. It is experimentally demonstrated that fine-tuning a pre-trained model along with data augmentation by horizontally flipping images and image normalization, helps in providing the best accuracy on the sign language dataset. The best setting achieves satisfactory classification accuracy, comparable to state-of-the-art systems in generic facial expression recognition. Experiments were performed using different combinations of the above-mentioned techniques based on two different architectures, namely MobileNet and EfficientNet, and is deemed that both architectures seem equally suitable for the purpose of fine-tuning, whereas class balancing is discouraged.
%U https://aclanthology.org/2022.sltat-1.5
%P 29-38
Markdown (Informal)
[Fine-tuning of Convolutional Neural Networks for the Recognition of Facial Expressions in Sign Language Video Samples](https://aclanthology.org/2022.sltat-1.5) (Deshpande et al., SLTAT 2022)
ACL