@inproceedings{chen-etal-2024-textual,
title = "A Textual Modal Supplement Framework for Understanding Multi-Modal Figurative Language",
author = "Chen, Jiale and
Yang, Qihao and
Dong, Xuelian and
Mao, Xiaoling and
Hao, Tianyong",
editor = "Ghosh, Debanjan and
Muresan, Smaranda and
Feldman, Anna and
Chakrabarty, Tuhin and
Liu, Emmy",
booktitle = "Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.figlang-1.12",
doi = "10.18653/v1/2024.figlang-1.12",
pages = "85--91",
abstract = "Figurative language in media such as memes, art, or comics has gained dramatic interest recently. However, the challenge remains in accurately justifying and explaining whether an image caption complements or contradicts the image it accompanies. To tackle this problem, we design a modal-supplement framework MAPPER consisting of a describer and thinker. The describer based on a frozen large vision model is designed to describe an image in detail to capture entailed semantic information. The thinker based on a finetuned large multi-modal model is designed to utilize description, claim and image to make prediction and explanation. Experiment results on a publicly available benchmark dataset from FigLang2024 Task 2 show that our method ranks at top 1 in overall evaluation, the performance exceeds the second place by 28.57{\%}. This indicates that MAPPER is highly effective in understanding, judging and explaining of the figurative language. The source code is available at https://github.com/Libv-Team/figlang2024.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2024-textual">
<titleInfo>
<title>A Textual Modal Supplement Framework for Understanding Multi-Modal Figurative Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiale</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qihao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuelian</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoling</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianyong</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Feldman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tuhin</namePart>
<namePart type="family">Chakrabarty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmy</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Figurative language in media such as memes, art, or comics has attracted considerable interest recently. However, accurately judging and explaining whether an image caption complements or contradicts the image it accompanies remains a challenge. To tackle this problem, we design MAPPER, a modal-supplement framework consisting of a describer and a thinker. The describer, based on a frozen large vision model, describes an image in detail to capture its entailed semantic information. The thinker, based on a fine-tuned large multi-modal model, uses the description, the claim, and the image to produce a prediction and an explanation. Experimental results on a publicly available benchmark dataset from FigLang 2024 Task 2 show that our method ranks first in the overall evaluation, exceeding the second-place result by 28.57%. This indicates that MAPPER is highly effective at understanding, judging, and explaining figurative language. The source code is available at https://github.com/Libv-Team/figlang2024.</abstract>
<identifier type="citekey">chen-etal-2024-textual</identifier>
<identifier type="doi">10.18653/v1/2024.figlang-1.12</identifier>
<location>
<url>https://aclanthology.org/2024.figlang-1.12</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>85</start>
<end>91</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Textual Modal Supplement Framework for Understanding Multi-Modal Figurative Language
%A Chen, Jiale
%A Yang, Qihao
%A Dong, Xuelian
%A Mao, Xiaoling
%A Hao, Tianyong
%Y Ghosh, Debanjan
%Y Muresan, Smaranda
%Y Feldman, Anna
%Y Chakrabarty, Tuhin
%Y Liu, Emmy
%S Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico (Hybrid)
%F chen-etal-2024-textual
%X Figurative language in media such as memes, art, or comics has attracted considerable interest recently. However, accurately judging and explaining whether an image caption complements or contradicts the image it accompanies remains a challenge. To tackle this problem, we design MAPPER, a modal-supplement framework consisting of a describer and a thinker. The describer, based on a frozen large vision model, describes an image in detail to capture its entailed semantic information. The thinker, based on a fine-tuned large multi-modal model, uses the description, the claim, and the image to produce a prediction and an explanation. Experimental results on a publicly available benchmark dataset from FigLang 2024 Task 2 show that our method ranks first in the overall evaluation, exceeding the second-place result by 28.57%. This indicates that MAPPER is highly effective at understanding, judging, and explaining figurative language. The source code is available at https://github.com/Libv-Team/figlang2024.
%R 10.18653/v1/2024.figlang-1.12
%U https://aclanthology.org/2024.figlang-1.12
%U https://doi.org/10.18653/v1/2024.figlang-1.12
%P 85-91
Markdown (Informal)
[A Textual Modal Supplement Framework for Understanding Multi-Modal Figurative Language](https://aclanthology.org/2024.figlang-1.12) (Chen et al., Fig-Lang-WS 2024)
ACL
Jiale Chen, Qihao Yang, Xuelian Dong, Xiaoling Mao, and Tianyong Hao. 2024. A Textual Modal Supplement Framework for Understanding Multi-Modal Figurative Language. In Proceedings of the 4th Workshop on Figurative Language Processing (FigLang 2024), pages 85–91, Mexico City, Mexico (Hybrid). Association for Computational Linguistics.
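The abstract above describes MAPPER as a two-stage describer-thinker pipeline: a frozen large vision model first produces a detailed textual description of the image, and a fine-tuned multi-modal model then combines that description with the claim and the image to predict and explain whether the caption complements or contradicts the image. The sketch below only illustrates that data flow; the class names, labels, and prompt wording are illustrative assumptions, not the authors' implementation (see https://github.com/Libv-Team/figlang2024 for the actual code).

```python
# Minimal sketch of the describer-thinker pipeline outlined in the abstract.
# The classes below are hypothetical stand-ins, not the authors' implementation.

from dataclasses import dataclass


@dataclass
class Verdict:
    label: str        # e.g. "entails" or "contradicts"
    explanation: str  # free-text justification of the label


class FrozenCaptioner:
    """Stand-in for the describer: a frozen large vision model that turns an
    image into a detailed textual description (the textual modal supplement)."""

    def describe(self, image_path: str) -> str:
        # A real implementation would run an image-captioning model here.
        return f"A detailed description of the scene in {image_path}."


class FineTunedThinker:
    """Stand-in for the thinker: a fine-tuned large multi-modal model that
    consumes the image, the claim, and the generated description."""

    def judge(self, image_path: str, claim: str, description: str) -> Verdict:
        prompt = (
            f"Image description: {description}\n"
            f"Claim: {claim}\n"
            "Does the claim complement or contradict the image? "
            "Give a label and an explanation."
        )
        # A real implementation would feed the image and prompt to the model.
        return Verdict(label="entails",
                       explanation=f"(model output for prompt: {prompt[:40]}...)")


def mapper_pipeline(image_path: str, claim: str) -> Verdict:
    """Two-stage pipeline: describe the image, then predict and explain."""
    description = FrozenCaptioner().describe(image_path)
    return FineTunedThinker().judge(image_path, claim, description)


if __name__ == "__main__":
    print(mapper_pipeline("meme.png", "Hard work always pays off."))
```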