@inproceedings{mondal-sarkar-2025-modgenix,
title = "Modgenix at {S}em{E}val-2025 Task 1: Context Aware Vision Language Ranking ({CAV}i{LR}) for Multimodal Idiomaticity Understanding",
author = "Mondal, Joydeb and
Sarkar, Pramir",
editor = "Rosenthal, Sara and
Ros{\'a}, Aiala and
Ghosh, Debanjan and
Zampieri, Marcos",
booktitle = "Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.semeval-1.106/",
pages = "780--784",
ISBN = "979-8-89176-273-2",
abstract = "This paper presents CAViLR, a hybrid multimodal approach for SemEval-2025 Task 1. Our methodintegrates CLIP as a baseline with a Mixture of Experts (MoE) framework that dynamically selectsexpert models such as Pixtral-12B and Phi-3.5 based on input context. The approach addresseschallenges in both image ranking and image sequence prediction, improving the alignment of visualand textual semantics. Experimental results demonstrate that our hybrid model outperforms individualmodels. Future work will focus on refining expert selection and enhancing disambiguation strategiesfor complex idiomatic expressions."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mondal-sarkar-2025-modgenix">
<titleInfo>
<title>Modgenix at SemEval-2025 Task 1: Context Aware Vision Language Ranking (CAViLR) for Multimodal Idiomaticity Understanding</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joydeb</namePart>
<namePart type="family">Mondal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pramir</namePart>
<namePart type="family">Sarkar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Rosenthal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aiala</namePart>
<namePart type="family">Rosá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debanjan</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-273-2</identifier>
</relatedItem>
<abstract>This paper presents CAViLR, a hybrid multimodal approach for SemEval-2025 Task 1. Our methodintegrates CLIP as a baseline with a Mixture of Experts (MoE) framework that dynamically selectsexpert models such as Pixtral-12B and Phi-3.5 based on input context. The approach addresseschallenges in both image ranking and image sequence prediction, improving the alignment of visualand textual semantics. Experimental results demonstrate that our hybrid model outperforms individualmodels. Future work will focus on refining expert selection and enhancing disambiguation strategiesfor complex idiomatic expressions.</abstract>
<identifier type="citekey">mondal-sarkar-2025-modgenix</identifier>
<location>
<url>https://aclanthology.org/2025.semeval-1.106/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>780</start>
<end>784</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Modgenix at SemEval-2025 Task 1: Context Aware Vision Language Ranking (CAViLR) for Multimodal Idiomaticity Understanding
%A Mondal, Joydeb
%A Sarkar, Pramir
%Y Rosenthal, Sara
%Y Rosá, Aiala
%Y Ghosh, Debanjan
%Y Zampieri, Marcos
%S Proceedings of the 19th International Workshop on Semantic Evaluation (SemEval-2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-273-2
%F mondal-sarkar-2025-modgenix
%X This paper presents CAViLR, a hybrid multimodal approach for SemEval-2025 Task 1. Our methodintegrates CLIP as a baseline with a Mixture of Experts (MoE) framework that dynamically selectsexpert models such as Pixtral-12B and Phi-3.5 based on input context. The approach addresseschallenges in both image ranking and image sequence prediction, improving the alignment of visualand textual semantics. Experimental results demonstrate that our hybrid model outperforms individualmodels. Future work will focus on refining expert selection and enhancing disambiguation strategiesfor complex idiomatic expressions.
%U https://aclanthology.org/2025.semeval-1.106/
%P 780-784
Markdown (Informal)
[Modgenix at SemEval-2025 Task 1: Context Aware Vision Language Ranking (CAViLR) for Multimodal Idiomaticity Understanding](https://aclanthology.org/2025.semeval-1.106/) (Mondal & Sarkar, SemEval 2025)
ACL