@inproceedings{attanasio-etal-2022-milanlp,
title = "{M}ila{NLP} at {S}em{E}val-2022 Task 5: Using Perceiver {IO} for Detecting Misogynous Memes with Text and Image Modalities",
author = "Attanasio, Giuseppe and
Nozza, Debora and
Bianchi, Federico",
editor = "Emerson, Guy and
Schluter, Natalie and
Stanovsky, Gabriel and
Kumar, Ritesh and
Palmer, Alexis and
Schneider, Nathan and
Singh, Siddharth and
Ratan, Shyam",
booktitle = "Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)",
month = jul,
year = "2022",
address = "Seattle, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.semeval-1.90",
doi = "10.18653/v1/2022.semeval-1.90",
pages = "654--662",
abstract = "In this paper, we describe the system proposed by the MilaNLP team for the Multimedia Automatic Misogyny Identification (MAMI) challenge. We use Perceiver IO as a multimodal late fusion over unimodal streams to address both sub-tasks A and B. We build unimodal embeddings using Vision Transformer (image) and RoBERTa (text transcript). We enrich the input representation using face and demographic recognition, image captioning, and detection of adult content and web entities. To the best of our knowledge, this work is the first to use Perceiver IO combining text and image modalities. The proposed approach outperforms unimodal and multimodal baselines.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="attanasio-etal-2022-milanlp">
<titleInfo>
<title>MilaNLP at SemEval-2022 Task 5: Using Perceiver IO for Detecting Misogynous Memes with Text and Image Modalities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Attanasio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debora</namePart>
<namePart type="family">Nozza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">Bianchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Emerson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Schluter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ritesh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathan</namePart>
<namePart type="family">Schneider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siddharth</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shyam</namePart>
<namePart type="family">Ratan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we describe the system proposed by the MilaNLP team for the Multimedia Automatic Misogyny Identification (MAMI) challenge. We use Perceiver IO as a multimodal late fusion over unimodal streams to address both sub-tasks A and B. We build unimodal embeddings using Vision Transformer (image) and RoBERTa (text transcript). We enrich the input representation using face and demographic recognition, image captioning, and detection of adult content and web entities. To the best of our knowledge, this work is the first to use Perceiver IO combining text and image modalities. The proposed approach outperforms unimodal and multimodal baselines.</abstract>
<identifier type="citekey">attanasio-etal-2022-milanlp</identifier>
<identifier type="doi">10.18653/v1/2022.semeval-1.90</identifier>
<location>
<url>https://aclanthology.org/2022.semeval-1.90</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>654</start>
<end>662</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MilaNLP at SemEval-2022 Task 5: Using Perceiver IO for Detecting Misogynous Memes with Text and Image Modalities
%A Attanasio, Giuseppe
%A Nozza, Debora
%A Bianchi, Federico
%Y Emerson, Guy
%Y Schluter, Natalie
%Y Stanovsky, Gabriel
%Y Kumar, Ritesh
%Y Palmer, Alexis
%Y Schneider, Nathan
%Y Singh, Siddharth
%Y Ratan, Shyam
%S Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F attanasio-etal-2022-milanlp
%X In this paper, we describe the system proposed by the MilaNLP team for the Multimedia Automatic Misogyny Identification (MAMI) challenge. We use Perceiver IO as a multimodal late fusion over unimodal streams to address both sub-tasks A and B. We build unimodal embeddings using Vision Transformer (image) and RoBERTa (text transcript). We enrich the input representation using face and demographic recognition, image captioning, and detection of adult content and web entities. To the best of our knowledge, this work is the first to use Perceiver IO combining text and image modalities. The proposed approach outperforms unimodal and multimodal baselines.
%R 10.18653/v1/2022.semeval-1.90
%U https://aclanthology.org/2022.semeval-1.90
%U https://doi.org/10.18653/v1/2022.semeval-1.90
%P 654-662
Markdown (Informal)
[MilaNLP at SemEval-2022 Task 5: Using Perceiver IO for Detecting Misogynous Memes with Text and Image Modalities](https://aclanthology.org/2022.semeval-1.90) (Attanasio et al., SemEval 2022)
ACL