@inproceedings{yang-wang-2024-figclip,
title = "{F}ig{CLIP}: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment",
author = "Yang, Qihao and
Wang, Xuelin",
editor = "Ghosh, Debanjan and
Muresan, Smaranda and
Feldman, Anna and
Chakrabarty, Tuhin and
Liu, Emmy",
booktitle = "Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.figlang-1.13",
doi = "10.18653/v1/2024.figlang-1.13",
pages = "92--98",
abstract = "This is a system paper for the FigLang-2024 Multimodal Figurative Language Shared Task. Figurative language is generally represented through multiple modalities, facilitating the expression of complex and abstract ideas. With the popularity of various text-to-image tools, a large number of images containing metaphors or irony have been created. The traditional task of recognizing textual entailment has been extended to understanding figurative language via visual entailment. However, existing pre-trained multimodal models in open domains often struggle with this task due to the intertwining of counterfactuals, human culture, and imagination. To bridge this gap, we propose FigCLIP, an end-to-end model based on CLIP and GPT-2, to identify multimodal figurative semantics and generate explanations. It employs a bidirectional fusion module with cross-attention and leverages explanations to promote the alignment of figurative image-text representations. Experimental results on the benchmark demonstrate the effectiveness of our method, achieving 70{\%} F1-score, 67{\%} F1@50-score and 50{\%} F1@60-score. It outperforms GPT-4V, which has robust visual reasoning capabilities.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-wang-2024-figclip">
<titleInfo>
<title>FigCLIP: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qihao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuelin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Feldman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tuhin</namePart>
<namePart type="family">Chakrabarty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmy</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This is a system paper for the FigLang-2024 Multimodal Figurative Language Shared Task. Figurative language is generally represented through multiple modalities, facilitating the expression of complex and abstract ideas. With the popularity of various text-to-image tools, a large number of images containing metaphors or irony have been created. The traditional task of recognizing textual entailment has been extended to understanding figurative language via visual entailment. However, existing pre-trained multimodal models in open domains often struggle with this task due to the intertwining of counterfactuals, human culture, and imagination. To bridge this gap, we propose FigCLIP, an end-to-end model based on CLIP and GPT-2, to identify multimodal figurative semantics and generate explanations. It employs a bidirectional fusion module with cross-attention and leverages explanations to promote the alignment of figurative image-text representations. Experimental results on the benchmark demonstrate the effectiveness of our method, achieving 70% F1-score, 67% F1@50-score and 50% F1@60-score. It outperforms GPT-4V, which has robust visual reasoning capabilities.</abstract>
<identifier type="citekey">yang-wang-2024-figclip</identifier>
<identifier type="doi">10.18653/v1/2024.figlang-1.13</identifier>
<location>
<url>https://aclanthology.org/2024.figlang-1.13</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>92</start>
<end>98</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FigCLIP: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment
%A Yang, Qihao
%A Wang, Xuelin
%Y Ghosh, Debanjan
%Y Muresan, Smaranda
%Y Feldman, Anna
%Y Chakrabarty, Tuhin
%Y Liu, Emmy
%S Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico (Hybrid)
%F yang-wang-2024-figclip
%X This is a system paper for the FigLang-2024 Multimodal Figurative Language Shared Task. Figurative language is generally represented through multiple modalities, facilitating the expression of complex and abstract ideas. With the popularity of various text-to-image tools, a large number of images containing metaphors or irony have been created. The traditional task of recognizing textual entailment has been extended to understanding figurative language via visual entailment. However, existing pre-trained multimodal models in open domains often struggle with this task due to the intertwining of counterfactuals, human culture, and imagination. To bridge this gap, we propose FigCLIP, an end-to-end model based on CLIP and GPT-2, to identify multimodal figurative semantics and generate explanations. It employs a bidirectional fusion module with cross-attention and leverages explanations to promote the alignment of figurative image-text representations. Experimental results on the benchmark demonstrate the effectiveness of our method, achieving 70% F1-score, 67% F1@50-score and 50% F1@60-score. It outperforms GPT-4V, which has robust visual reasoning capabilities.
%R 10.18653/v1/2024.figlang-1.13
%U https://aclanthology.org/2024.figlang-1.13
%U https://doi.org/10.18653/v1/2024.figlang-1.13
%P 92-98
Markdown (Informal)
[FigCLIP: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment](https://aclanthology.org/2024.figlang-1.13) (Yang & Wang, Fig-Lang-WS 2024)
ACL
Qihao Yang and Xuelin Wang. 2024. FigCLIP: A Generative Multimodal Model with Bidirectional Cross-attention for Understanding Figurative Language via Visual Entailment. In Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024), pages 92–98, Mexico City, Mexico (Hybrid). Association for Computational Linguistics.