@inproceedings{hyun-etal-2024-smile,
title = "{SMILE}: Multimodal Dataset for Understanding Laughter in Video with Language Models",
author = "Hyun, Lee and
Sung-Bin, Kim and
Han, Seungju and
Yu, Youngjae and
Oh, Tae-Hyun",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.73/",
doi = "10.18653/v1/2024.findings-naacl.73",
pages = "1149--1167",
abstract = "Despite the recent advances in artificial intelligence, building social intelligence remains a challenge.Among social signals, laughter is one of the distinctive expressions that occurs during social interactions between humans.In this work, we tackle a new challenge for machines to understand the rationale behind laughter in video, Video Laugh Reasoning.We introduce this new task to explain why people laugh in a particular video and a dataset for this task.Our proposed dataset, SMILE, comprises video clips and language descriptions of why people laugh. We propose a baseline by leveraging the reasoning capacity of large language models (LLMs) with textual video representation. Experiments show that our baseline can generate plausible explanations for laughter. We further investigate the scalability of our baseline by probing other video understanding tasks and in-the-wild videos. We release our dataset, code, and model checkpoints on https://github.com/postech-ami/SMILE-Dataset."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hyun-etal-2024-smile">
<titleInfo>
<title>SMILE: Multimodal Dataset for Understanding Laughter in Video with Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lee</namePart>
<namePart type="family">Hyun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kim</namePart>
<namePart type="family">Sung-Bin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seungju</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Youngjae</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tae-Hyun</namePart>
<namePart type="family">Oh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite the recent advances in artificial intelligence, building social intelligence remains a challenge.Among social signals, laughter is one of the distinctive expressions that occurs during social interactions between humans.In this work, we tackle a new challenge for machines to understand the rationale behind laughter in video, Video Laugh Reasoning.We introduce this new task to explain why people laugh in a particular video and a dataset for this task.Our proposed dataset, SMILE, comprises video clips and language descriptions of why people laugh. We propose a baseline by leveraging the reasoning capacity of large language models (LLMs) with textual video representation. Experiments show that our baseline can generate plausible explanations for laughter. We further investigate the scalability of our baseline by probing other video understanding tasks and in-the-wild videos. We release our dataset, code, and model checkpoints on https://github.com/postech-ami/SMILE-Dataset.</abstract>
<identifier type="citekey">hyun-etal-2024-smile</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.73</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.73/</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>1149</start>
<end>1167</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SMILE: Multimodal Dataset for Understanding Laughter in Video with Language Models
%A Hyun, Lee
%A Sung-Bin, Kim
%A Han, Seungju
%A Yu, Youngjae
%A Oh, Tae-Hyun
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F hyun-etal-2024-smile
%X Despite the recent advances in artificial intelligence, building social intelligence remains a challenge.Among social signals, laughter is one of the distinctive expressions that occurs during social interactions between humans.In this work, we tackle a new challenge for machines to understand the rationale behind laughter in video, Video Laugh Reasoning.We introduce this new task to explain why people laugh in a particular video and a dataset for this task.Our proposed dataset, SMILE, comprises video clips and language descriptions of why people laugh. We propose a baseline by leveraging the reasoning capacity of large language models (LLMs) with textual video representation. Experiments show that our baseline can generate plausible explanations for laughter. We further investigate the scalability of our baseline by probing other video understanding tasks and in-the-wild videos. We release our dataset, code, and model checkpoints on https://github.com/postech-ami/SMILE-Dataset.
%R 10.18653/v1/2024.findings-naacl.73
%U https://aclanthology.org/2024.findings-naacl.73/
%U https://doi.org/10.18653/v1/2024.findings-naacl.73
%P 1149-1167
Markdown (Informal)
[SMILE: Multimodal Dataset for Understanding Laughter in Video with Language Models](https://aclanthology.org/2024.findings-naacl.73/) (Hyun et al., Findings 2024)
ACL