@inproceedings{cotiga-nisioi-2026-dcsn,
title = "{DCSN}-{NLP} at {MWE}-2026 {A}d{MIR}e 2: Bridging Literal and Figurative Meaning Through Hierarchical Multimodal Reasoning",
author = "Cotig{\u{a}}, David and
Nisioi, Sergiu",
editor = {Ojha, Atul Kr. and
Mititelu, Verginica Barbu and
Constant, Mathieu and
Stoyanova, Ivelina and
Do{\u{g}}ru{\"o}z, A. Seza and
Rademaker, Alexandre},
booktitle = "Proceedings of the 22nd Workshop on Multiword Expressions ({MWE} 2026)",
month = mar,
year = "2026",
    address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.mwe-1.29/",
pages = "217--225",
ISBN = "979-8-89176-363-0",
    abstract = "This paper presents our system for the MWE-2026 ADMiRe 2.0 shared task, which aimed to advance multimodal idiomatic understanding across 15 languages. We address the task of selecting, from a set of five images, the one that best represents either the literal or idiomatic meaning of a given compound in context. Our approach follows a multi-step pipeline: a large language model (LLM) first determines whether the compound is used literally or idiomatically and generates auxiliary text, consisting of an idiomatic meaning explanation and a visual description of the literal meaning. An ensemble of three CLIP models then identifies the two images most semantically similar to the appropriate generated text via a voting mechanism. Finally, the LLM selects the best image from these two candidates."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cotiga-nisioi-2026-dcsn">
<titleInfo>
<title>DCSN-NLP at MWE-2026 AdMIRe 2: Bridging Literal and Figurative Meaning Through Hierarchical Multimodal Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Cotigă</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergiu</namePart>
<namePart type="family">Nisioi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 22nd Workshop on Multiword Expressions (MWE 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atul</namePart>
<namePart type="given">Kr.</namePart>
<namePart type="family">Ojha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Verginica</namePart>
<namePart type="given">Barbu</namePart>
<namePart type="family">Mititelu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Constant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivelina</namePart>
<namePart type="family">Stoyanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">A</namePart>
<namePart type="given">Seza</namePart>
<namePart type="family">Doğruöz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandre</namePart>
<namePart type="family">Rademaker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-363-0</identifier>
</relatedItem>
<abstract>This paper presents our system for the MWE-2026 ADMiRe 2.0 shared task, which aimed to advance multimodal idiomatic understanding across 15 languages. We address the task of selecting, from a set of five images, the one that best represents either the literal or idiomatic meaning of a given compound in context. Our approach follows a multi-step pipeline: a large language model (LLM) first determines whether the compound is used literally or idiomatically and generates auxiliary text, consisting of an idiomatic meaning explanation and a visual description of the literal meaning. An ensemble of three CLIP models then identifies the two images most semantically similar to the appropriate generated text via a voting mechanism. Finally, the LLM selects the best image from these two candidates.</abstract>
<identifier type="citekey">cotiga-nisioi-2026-dcsn</identifier>
<location>
<url>https://aclanthology.org/2026.mwe-1.29/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>217</start>
<end>225</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DCSN-NLP at MWE-2026 AdMIRe 2: Bridging Literal and Figurative Meaning Through Hierarchical Multimodal Reasoning
%A Cotigă, David
%A Nisioi, Sergiu
%Y Ojha, Atul Kr.
%Y Mititelu, Verginica Barbu
%Y Constant, Mathieu
%Y Stoyanova, Ivelina
%Y Doğruöz, A. Seza
%Y Rademaker, Alexandre
%S Proceedings of the 22nd Workshop on Multiword Expressions (MWE 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-363-0
%F cotiga-nisioi-2026-dcsn
%X This paper presents our system for the MWE-2026 ADMiRe 2.0 shared task, which aimed to advance multimodal idiomatic understanding across 15 languages. We address the task of selecting, from a set of five images, the one that best represents either the literal or idiomatic meaning of a given compound in context. Our approach follows a multi-step pipeline: a large language model (LLM) first determines whether the compound is used literally or idiomatically and generates auxiliary text, consisting of an idiomatic meaning explanation and a visual description of the literal meaning. An ensemble of three CLIP models then identifies the two images most semantically similar to the appropriate generated text via a voting mechanism. Finally, the LLM selects the best image from these two candidates.
%U https://aclanthology.org/2026.mwe-1.29/
%P 217-225
Markdown (Informal)
[DCSN-NLP at MWE-2026 AdMIRe 2: Bridging Literal and Figurative Meaning Through Hierarchical Multimodal Reasoning](https://aclanthology.org/2026.mwe-1.29/) (Cotigă & Nisioi, MWE 2026)
ACL