% System description paper of team dutir914 for SemEval-2025 Task 1
% (ACL Anthology ID 2025.semeval-1.159).
% The abstract is kept as plain text: emphasis and footnote macros are omitted
% because presentation belongs to the bibliography style, not the database.
@inproceedings{wang-etal-2025-dutir914,
  title     = {dutir914 at {SemEval}-2025 Task 1: An integrated approach for Multimodal Idiomaticity Representations},
  author    = {Wang, Yanan and
               Li, Dailin and
               Tian, Yicen and
               Zhang, Bo and
               Jian, Wang and
               Yang, Liang},
  editor    = {Rosenthal, Sara and
               Ros{\'a}, Aiala and
               Ghosh, Debanjan and
               Zampieri, Marcos},
  booktitle = {Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.semeval-1.159/},
  pages     = {1198--1203},
  isbn      = {979-8-89176-273-2},
  abstract  = {SemEval-2025 Task 1 introduces multimodal datasets for idiomatic expression representation. Subtask A focuses on ranking images based on potentially idiomatic noun compounds in given sentences. Idiom comprehension demands the fusion of visual and auditory elements with contextual semantics, yet existing datasets exhibit phrase-image discordance and culture-specific opacity, impeding cross-modal semantic alignment. To address these challenges, we propose an integrated approach that combines data augmentation and model fine-tuning in subtask A. First, we construct two idiom datasets by generating visual metaphors for idiomatic expressions to fine-tune the CLIP model. Next, we propose a three-stage multimodal chain-of-thought method, fine-tuning Qwen2.5-VL-7B-Instruct to generate rationales and perform inference, alongside zero-shot experiments with Qwen2.5-VL-72B-Instruct. Finally, we integrate the output of different models through a voting mechanism to enhance the accuracy of multimodal semantic matching. This approach achieves 0.92 accuracy on the Portuguese test set and 0.93 on the English test set, ranking 3rd and 4th, respectively. The implementation code is publicly available at https://github.com/wyn1015/semeval.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2025-dutir914">
<titleInfo>
<title>dutir914 at SemEval-2025 Task 1: An integrated approach for Multimodal Idiomaticity Representations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yanan</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dailin</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yicen</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wang</namePart>
<namePart type="family">Jian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Rosenthal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aiala</namePart>
<namePart type="family">Rosá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-273-2</identifier>
</relatedItem>
<abstract>SemEval-2025 Task 1 introduces multimodal datasets for idiomatic expression representation. Subtask A focuses on ranking images based on potentially idiomatic noun compounds in given sentences. Idiom comprehension demands the fusion of visual and auditory elements with contextual semantics, yet existing datasets exhibit phrase-image discordance and culture-specific opacity, impeding cross-modal semantic alignment. To address these challenges, we propose an integrated approach that combines data augmentation and model fine-tuning in subtask A. First, we construct two idiom datasets by generating visual metaphors for idiomatic expressions to fine-tune the CLIP model. Next, we propose a three-stage multimodal chain-of-thought method, fine-tuning Qwen2.5-VL-7B-Instruct to generate rationales and perform inference, alongside zero-shot experiments with Qwen2.5-VL-72B-Instruct. Finally, we integrate the output of different models through a voting mechanism to enhance the accuracy of multimodal semantic matching. This approach achieves 0.92 accuracy on the Portuguese test set and 0.93 on the English test set, ranking 3rd and 4th, respectively. The implementation code is publicly available at https://github.com/wyn1015/semeval.</abstract>
<identifier type="citekey">wang-etal-2025-dutir914</identifier>
<location>
<url>https://aclanthology.org/2025.semeval-1.159/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1198</start>
<end>1203</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T dutir914 at SemEval-2025 Task 1: An integrated approach for Multimodal Idiomaticity Representations
%A Wang, Yanan
%A Li, Dailin
%A Tian, Yicen
%A Zhang, Bo
%A Jian, Wang
%A Yang, Liang
%Y Rosenthal, Sara
%Y Rosá, Aiala
%Y Ghosh, Debanjan
%Y Zampieri, Marcos
%S Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-273-2
%F wang-etal-2025-dutir914
%X SemEval-2025 Task 1 introduces multimodal datasets for idiomatic expression representation. Subtask A focuses on ranking images based on potentially idiomatic noun compounds in given sentences. Idiom comprehension demands the fusion of visual and auditory elements with contextual semantics, yet existing datasets exhibit phrase-image discordance and culture-specific opacity, impeding cross-modal semantic alignment. To address these challenges, we propose an integrated approach that combines data augmentation and model fine-tuning in subtask A. First, we construct two idiom datasets by generating visual metaphors for idiomatic expressions to fine-tune the CLIP model. Next, we propose a three-stage multimodal chain-of-thought method, fine-tuning Qwen2.5-VL-7B-Instruct to generate rationales and perform inference, alongside zero-shot experiments with Qwen2.5-VL-72B-Instruct. Finally, we integrate the output of different models through a voting mechanism to enhance the accuracy of multimodal semantic matching. This approach achieves 0.92 accuracy on the Portuguese test set and 0.93 on the English test set, ranking 3rd and 4th, respectively. The implementation code is publicly available at https://github.com/wyn1015/semeval.
%U https://aclanthology.org/2025.semeval-1.159/
%P 1198-1203
Markdown (Informal)
[dutir914 at SemEval-2025 Task 1: An integrated approach for Multimodal Idiomaticity Representations](https://aclanthology.org/2025.semeval-1.159/) (Wang et al., SemEval 2025)
ACL