@inproceedings{site-etal-2026-itunlp,
title = "{ITUNLP} at {MWE}-2026 {A}d{MIR}e 2: A Zero-Shot {LLM} Pipeline for Multimodal Idiom Understanding and Ranking",
author = {Site, Atakan and
Arslan, O{\u{g}}uz Ali and
Eryi{\u{g}}it, G{\"u}l{\c{s}}en},
editor = {Ojha, Atul Kr. and
Mititelu, Verginica Barbu and
Constant, Mathieu and
Stoyanova, Ivelina and
Do{\u{g}}ru{\"o}z, A. Seza and
Rademaker, Alexandre},
booktitle = "Proceedings of the 22nd Workshop on Multiword Expressions ({MWE} 2026)",
month = mar,
year = "2026",
    address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.mwe-1.30/",
pages = "226--236",
ISBN = "979-8-89176-363-0",
    abstract = "This paper presents our system for AdMIRe 2 (Advancing Multimodal Idiomaticity Representation), a shared task on multilingual multimodal idiom understanding. The task focuses on ranking images according to how well they depict the literal or idiomatic usage of potentially idiomatic expressions (PIEs) in context, across 15 languages and two tracks: a text-only track, and a multimodal track that uses both images and captions. To tackle both tracks, we propose a hybrid zero-shot pipeline built on large vision{--}language models (LVLMs). Our system employs a chain-of-thought prompting scheme that first classifies each PIE usage as literal or idiomatic and then ranks candidate images by their alignment with the inferred meaning. A primary{--}fallback routing mechanism increases robustness to safety-filter refusals, while lightweight post-processing recovers consistent rankings from imperfect model outputs. Without any task-specific fine-tuning, our approach achieves 55.9{\%} Top-1 Accuracy in the text-only track and 60.1{\%} in the multimodal (text+image) track, ranking first overall on the official leaderboard. These results suggest that carefully designed zero-shot LVLM pipelines can provide strong baselines for multilingual multimodal idiomaticity benchmarks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="site-etal-2026-itunlp">
<titleInfo>
<title>ITUNLP at MWE-2026 AdMIRe 2: A Zero-Shot LLM Pipeline for Multimodal Idiom Understanding and Ranking</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atakan</namePart>
<namePart type="family">Site</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oğuz</namePart>
<namePart type="given">Ali</namePart>
<namePart type="family">Arslan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gülşen</namePart>
<namePart type="family">Eryiğit</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Workshop on Multiword Expressions (MWE 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verginica</namePart>
<namePart type="given">Barbu</namePart>
<namePart type="family">Mititelu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Constant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivelina</namePart>
<namePart type="family">Stoyanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A</namePart>
<namePart type="given">Seza</namePart>
<namePart type="family">Doğruöz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Rademaker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-363-0</identifier>
</relatedItem>
<abstract>This paper presents our system for AdMIRe 2 (Advancing Multimodal Idiomaticity Representation), a shared task on multilingual multimodal idiom understanding. The task focuses on ranking images according to how well they depict the literal or idiomatic usage of potentially idiomatic expressions (PIEs) in context, across 15 languages and two tracks: a text-only track, and a multimodal track that uses both images and captions. To tackle both tracks, we propose a hybrid zero-shot pipeline built on large vision–language models (LVLMs). Our system employs a chain-of-thought prompting scheme that first classifies each PIE usage as literal or idiomatic and then ranks candidate images by their alignment with the inferred meaning. A primary–fallback routing mechanism increases robustness to safety-filter refusals, while lightweight post-processing recovers consistent rankings from imperfect model outputs. Without any task-specific fine-tuning, our approach achieves 55.9% Top-1 Accuracy in the text-only track and 60.1% in the multimodal (text+image) track, ranking first overall on the official leaderboard. These results suggest that carefully designed zero-shot LVLM pipelines can provide strong baselines for multilingual multimodal idiomaticity benchmarks.</abstract>
<identifier type="citekey">site-etal-2026-itunlp</identifier>
<location>
<url>https://aclanthology.org/2026.mwe-1.30/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>226</start>
<end>236</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ITUNLP at MWE-2026 AdMIRe 2: A Zero-Shot LLM Pipeline for Multimodal Idiom Understanding and Ranking
%A Site, Atakan
%A Arslan, Oğuz Ali
%A Eryiğit, Gülşen
%Y Ojha, Atul Kr.
%Y Mititelu, Verginica Barbu
%Y Constant, Mathieu
%Y Stoyanova, Ivelina
%Y Doğruöz, A. Seza
%Y Rademaker, Alexandre
%S Proceedings of the 22nd Workshop on Multiword Expressions (MWE 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-363-0
%F site-etal-2026-itunlp
%X This paper presents our system for AdMIRe 2 (Advancing Multimodal Idiomaticity Representation), a shared task on multilingual multimodal idiom understanding. The task focuses on ranking images according to how well they depict the literal or idiomatic usage of potentially idiomatic expressions (PIEs) in context, across 15 languages and two tracks: a text-only track, and a multimodal track that uses both images and captions. To tackle both tracks, we propose a hybrid zero-shot pipeline built on large vision–language models (LVLMs). Our system employs a chain-of-thought prompting scheme that first classifies each PIE usage as literal or idiomatic and then ranks candidate images by their alignment with the inferred meaning. A primary–fallback routing mechanism increases robustness to safety-filter refusals, while lightweight post-processing recovers consistent rankings from imperfect model outputs. Without any task-specific fine-tuning, our approach achieves 55.9% Top-1 Accuracy in the text-only track and 60.1% in the multimodal (text+image) track, ranking first overall on the official leaderboard. These results suggest that carefully designed zero-shot LVLM pipelines can provide strong baselines for multilingual multimodal idiomaticity benchmarks.
%U https://aclanthology.org/2026.mwe-1.30/
%P 226-236
Markdown (Informal)
[ITUNLP at MWE-2026 AdMIRe 2: A Zero-Shot LLM Pipeline for Multimodal Idiom Understanding and Ranking](https://aclanthology.org/2026.mwe-1.30/) (Site et al., MWE 2026)