@inproceedings{sanchez-villegas-etal-2024-improving,
title = "Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary Tasks",
author = "Sanchez Villegas, Danae and
Preotiuc-Pietro, Daniel and
Aletras, Nikolaos",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2024",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-eacl.76/",
pages = "1126--1137",
abstract = "Effectively leveraging multimodal information from social media posts is essential to various downstream tasks such as sentiment analysis, sarcasm detection or hate speech classification. Jointly modeling text and images is challenging because cross-modal semantics might be hidden or the relation between image and text is weak. However, prior work on multimodal classification of social media posts has not yet addressed these challenges. In this work, we present an extensive study on the effectiveness of using two auxiliary losses jointly with the main task during fine-tuning multimodal models. First, Image-Text Contrastive (ITC) is designed to minimize the distance between image-text representations within a post, thereby effectively bridging the gap between posts where the image plays an important role in conveying the post`s meaning. Second, Image-Text Matching (ITM) enhances the model`s ability to understand the semantic relationship between images and text, thus improving its capacity to handle ambiguous or loosely related posts. We combine these objectives with five multimodal models, demonstrating consistent improvements of up to 2.6 F1 score across five diverse social media datasets. Our comprehensive analysis shows the specific scenarios where each auxiliary task is most effective."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sanchez-villegas-etal-2024-improving">
<titleInfo>
<title>Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Danae</namePart>
<namePart type="family">Sanchez Villegas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preotiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthew</namePart>
<namePart type="family">Purver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julian’s, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Effectively leveraging multimodal information from social media posts is essential to various downstream tasks such as sentiment analysis, sarcasm detection or hate speech classification. Jointly modeling text and images is challenging because cross-modal semantics might be hidden or the relation between image and text is weak. However, prior work on multimodal classification of social media posts has not yet addressed these challenges. In this work, we present an extensive study on the effectiveness of using two auxiliary losses jointly with the main task during fine-tuning multimodal models. First, Image-Text Contrastive (ITC) is designed to minimize the distance between image-text representations within a post, thereby effectively bridging the gap between posts where the image plays an important role in conveying the post‘s meaning. Second, Image-Text Matching (ITM) enhances the model‘s ability to understand the semantic relationship between images and text, thus improving its capacity to handle ambiguous or loosely related posts. We combine these objectives with five multimodal models, demonstrating consistent improvements of up to 2.6 F1 score across five diverse social media datasets. Our comprehensive analysis shows the specific scenarios where each auxiliary task is most effective.</abstract>
<identifier type="citekey">sanchez-villegas-etal-2024-improving</identifier>
<location>
<url>https://aclanthology.org/2024.findings-eacl.76/</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>1126</start>
<end>1137</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary Tasks
%A Sanchez Villegas, Danae
%A Preotiuc-Pietro, Daniel
%A Aletras, Nikolaos
%Y Graham, Yvette
%Y Purver, Matthew
%S Findings of the Association for Computational Linguistics: EACL 2024
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F sanchez-villegas-etal-2024-improving
%X Effectively leveraging multimodal information from social media posts is essential to various downstream tasks such as sentiment analysis, sarcasm detection or hate speech classification. Jointly modeling text and images is challenging because cross-modal semantics might be hidden or the relation between image and text is weak. However, prior work on multimodal classification of social media posts has not yet addressed these challenges. In this work, we present an extensive study on the effectiveness of using two auxiliary losses jointly with the main task during fine-tuning multimodal models. First, Image-Text Contrastive (ITC) is designed to minimize the distance between image-text representations within a post, thereby effectively bridging the gap between posts where the image plays an important role in conveying the post’s meaning. Second, Image-Text Matching (ITM) enhances the model’s ability to understand the semantic relationship between images and text, thus improving its capacity to handle ambiguous or loosely related posts. We combine these objectives with five multimodal models, demonstrating consistent improvements of up to 2.6 F1 score across five diverse social media datasets. Our comprehensive analysis shows the specific scenarios where each auxiliary task is most effective.
%U https://aclanthology.org/2024.findings-eacl.76/
%P 1126-1137
Markdown (Informal)
[Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary Tasks](https://aclanthology.org/2024.findings-eacl.76/) (Sanchez Villegas et al., Findings 2024)
ACL
Danae Sanchez Villegas, Daniel Preotiuc-Pietro, and Nikolaos Aletras. 2024. [Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary Tasks](https://aclanthology.org/2024.findings-eacl.76/). In *Findings of the Association for Computational Linguistics: EACL 2024*, pages 1126–1137, St. Julian’s, Malta. Association for Computational Linguistics.
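
For context on the two auxiliary objectives named in the abstract: ITC and ITM follow standard formulations from vision-language pretraining. The sketch below is a minimal PyTorch illustration under assumptions of our own (symmetric InfoNCE with in-batch negatives for ITC, a binary matched/mismatched head for ITM, and placeholder loss weights `lambda_itc`/`lambda_itm`); it is not the authors' released code.

```python
import torch
import torch.nn.functional as F

def itc_loss(img_emb, txt_emb, temperature=0.07):
    """Image-Text Contrastive: pull together the image/text embeddings of the
    same post, push apart pairs formed across posts in the batch (symmetric
    InfoNCE). The temperature is a common default, not the paper's value."""
    img_emb = F.normalize(img_emb, dim=-1)
    txt_emb = F.normalize(txt_emb, dim=-1)
    logits = img_emb @ txt_emb.t() / temperature           # (B, B) similarities
    targets = torch.arange(img_emb.size(0), device=img_emb.device)
    return 0.5 * (F.cross_entropy(logits, targets)         # image -> text
                  + F.cross_entropy(logits.t(), targets))  # text -> image

def itm_loss(pair_logits, is_matched):
    """Image-Text Matching: binary classification of whether an image-text
    pair comes from the same post; negatives are typically built by pairing
    a post's text with another post's image."""
    return F.cross_entropy(pair_logits, is_matched)

def joint_loss(task_logits, labels, img_emb, txt_emb, pair_logits, is_matched,
               lambda_itc=0.1, lambda_itm=0.1):
    """Main-task classification loss plus weighted auxiliary losses,
    optimized jointly during fine-tuning. Lambda weights are illustrative."""
    return (F.cross_entropy(task_logits, labels)
            + lambda_itc * itc_loss(img_emb, txt_emb)
            + lambda_itm * itm_loss(pair_logits, is_matched))
```

Here `img_emb` and `txt_emb` would be (B, d) pooled encoder outputs for a batch of B posts, and `pair_logits` a (N, 2) output of a fusion head over sampled matched/mismatched pairs with 0/1 labels in `is_matched`.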