@inproceedings{cai-etal-2019-multi,
title = "Multi-Modal Sarcasm Detection in {T}witter with Hierarchical Fusion Model",
author = "Cai, Yitao and
Cai, Huiyu and
Wan, Xiaojun",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1239/",
doi = "10.18653/v1/P19-1239",
pages = "2506--2515",
abstract = "Sarcasm is a subtle form of language in which people express the opposite of what is implied. Previous works of sarcasm detection focused on texts. However, more and more social media platforms like Twitter allow users to create multi-modal messages, including texts, images, and videos. It is insufficient to detect sarcasm from multi-model messages based only on texts. In this paper, we focus on multi-modal sarcasm detection for tweets consisting of texts and images in Twitter. We treat text features, image features and image attributes as three modalities and propose a multi-modal hierarchical fusion model to address this task. Our model first extracts image features and attribute features, and then leverages attribute features and bidirectional LSTM network to extract text features. Features of three modalities are then reconstructed and fused into one feature vector for prediction. We create a multi-modal sarcasm detection dataset based on Twitter. Evaluation results on the dataset demonstrate the efficacy of our proposed model and the usefulness of the three modalities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cai-etal-2019-multi">
<titleInfo>
<title>Multi-Modal Sarcasm Detection in Twitter with Hierarchical Fusion Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yitao</namePart>
<namePart type="family">Cai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huiyu</namePart>
<namePart type="family">Cai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaojun</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Korhonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Traum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Màrquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Florence, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Sarcasm is a subtle form of language in which people express the opposite of what is implied. Previous works of sarcasm detection focused on texts. However, more and more social media platforms like Twitter allow users to create multi-modal messages, including texts, images, and videos. It is insufficient to detect sarcasm from multi-model messages based only on texts. In this paper, we focus on multi-modal sarcasm detection for tweets consisting of texts and images in Twitter. We treat text features, image features and image attributes as three modalities and propose a multi-modal hierarchical fusion model to address this task. Our model first extracts image features and attribute features, and then leverages attribute features and bidirectional LSTM network to extract text features. Features of three modalities are then reconstructed and fused into one feature vector for prediction. We create a multi-modal sarcasm detection dataset based on Twitter. Evaluation results on the dataset demonstrate the efficacy of our proposed model and the usefulness of the three modalities.</abstract>
<identifier type="citekey">cai-etal-2019-multi</identifier>
<identifier type="doi">10.18653/v1/P19-1239</identifier>
<location>
<url>https://aclanthology.org/P19-1239/</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>2506</start>
<end>2515</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-Modal Sarcasm Detection in Twitter with Hierarchical Fusion Model
%A Cai, Yitao
%A Cai, Huiyu
%A Wan, Xiaojun
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F cai-etal-2019-multi
%X Sarcasm is a subtle form of language in which people express the opposite of what is implied. Previous works of sarcasm detection focused on texts. However, more and more social media platforms like Twitter allow users to create multi-modal messages, including texts, images, and videos. It is insufficient to detect sarcasm from multi-model messages based only on texts. In this paper, we focus on multi-modal sarcasm detection for tweets consisting of texts and images in Twitter. We treat text features, image features and image attributes as three modalities and propose a multi-modal hierarchical fusion model to address this task. Our model first extracts image features and attribute features, and then leverages attribute features and bidirectional LSTM network to extract text features. Features of three modalities are then reconstructed and fused into one feature vector for prediction. We create a multi-modal sarcasm detection dataset based on Twitter. Evaluation results on the dataset demonstrate the efficacy of our proposed model and the usefulness of the three modalities.
%R 10.18653/v1/P19-1239
%U https://aclanthology.org/P19-1239/
%U https://doi.org/10.18653/v1/P19-1239
%P 2506-2515
Markdown (Informal)
[Multi-Modal Sarcasm Detection in Twitter with Hierarchical Fusion Model](https://aclanthology.org/P19-1239/) (Cai et al., ACL 2019)
ACL