@inproceedings{sun-etal-2025-llava,
title = "{LL}a{VA}-{RE}: Binary Image-Text Relevancy Evaluation with Multimodal Large Language Model",
author = "Sun, Tao and
Liu, Oliver and
Li, JinJin and
Ma, Lan",
editor = "Zhang, Wei Emma and
Dai, Xiang and
Elliot, Desmond and
Fang, Byron and
Sim, Mongyuan and
Zhuang, Haojie and
Chen, Weitong",
booktitle = "Proceedings of the First Workshop of Evaluation of Multi-Modal Generation",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.evalmg-1.4/",
pages = "40--51",
abstract = "Multimodal generative AI usually involves generating image or text responses given inputs in another modality. The evaluation of image-text relevancy is essential for measuring the response quality or ranking candidate responses. In particular, binary relevancy evaluation, i.e., {\textquotedblleft}Relevant{\textquotedblright} vs. {\textquotedblleft}Not Relevant{\textquotedblright}, is a fundamental problem. However, this is a challenging task considering that texts have diverse formats and the definition of relevancy varies in different scenarios. We find that Multimodal Large Language Models (MLLMs) are an ideal choice to build such evaluators, as they can flexibly handle complex text formats and take in additional task information. In this paper, we present LLaVA-RE, a first attempt for binary image-text relevancy evaluation with MLLM. It follows the LLaVA architecture and adopts detailed task instructions and multimodal in-context samples. Further, we propose a novel binary relevancy dataset covering diverse tasks. Experimental results validate the effectiveness of our framework."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sun-etal-2025-llava">
    <titleInfo>
      <title>LLaVA-RE: Binary Image-Text Relevancy Evaluation with Multimodal Large Language Model</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Tao</namePart>
      <namePart type="family">Sun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Oliver</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">JinJin</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lan</namePart>
      <namePart type="family">Ma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop of Evaluation of Multi-Modal Generation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wei</namePart>
        <namePart type="given">Emma</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xiang</namePart>
        <namePart type="family">Dai</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Desmond</namePart>
        <namePart type="family">Elliot</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Byron</namePart>
        <namePart type="family">Fang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mongyuan</namePart>
        <namePart type="family">Sim</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Haojie</namePart>
        <namePart type="family">Zhuang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Weitong</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Multimodal generative AI usually involves generating image or text responses given inputs in another modality. The evaluation of image-text relevancy is essential for measuring the response quality or ranking candidate responses. In particular, binary relevancy evaluation, i.e., “Relevant” vs. “Not Relevant”, is a fundamental problem. However, this is a challenging task considering that texts have diverse formats and the definition of relevancy varies in different scenarios. We find that Multimodal Large Language Models (MLLMs) are an ideal choice for building such evaluators, as they can flexibly handle complex text formats and take in additional task information. In this paper, we present LLaVA-RE, a first attempt at binary image-text relevancy evaluation with an MLLM. It follows the LLaVA architecture and adopts detailed task instructions and multimodal in-context samples. Further, we propose a novel binary relevancy dataset covering diverse tasks. Experimental results validate the effectiveness of our framework.</abstract>
    <identifier type="citekey">sun-etal-2025-llava</identifier>
    <location>
      <url>https://aclanthology.org/2025.evalmg-1.4/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>40</start>
        <end>51</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LLaVA-RE: Binary Image-Text Relevancy Evaluation with Multimodal Large Language Model
%A Sun, Tao
%A Liu, Oliver
%A Li, JinJin
%A Ma, Lan
%Y Zhang, Wei Emma
%Y Dai, Xiang
%Y Elliot, Desmond
%Y Fang, Byron
%Y Sim, Mongyuan
%Y Zhuang, Haojie
%Y Chen, Weitong
%S Proceedings of the First Workshop of Evaluation of Multi-Modal Generation
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F sun-etal-2025-llava
%X Multimodal generative AI usually involves generating image or text responses given inputs in another modality. The evaluation of image-text relevancy is essential for measuring the response quality or ranking candidate responses. In particular, binary relevancy evaluation, i.e., “Relevant” vs. “Not Relevant”, is a fundamental problem. However, this is a challenging task considering that texts have diverse formats and the definition of relevancy varies in different scenarios. We find that Multimodal Large Language Models (MLLMs) are an ideal choice for building such evaluators, as they can flexibly handle complex text formats and take in additional task information. In this paper, we present LLaVA-RE, a first attempt at binary image-text relevancy evaluation with an MLLM. It follows the LLaVA architecture and adopts detailed task instructions and multimodal in-context samples. Further, we propose a novel binary relevancy dataset covering diverse tasks. Experimental results validate the effectiveness of our framework.
%U https://aclanthology.org/2025.evalmg-1.4/
%P 40-51
Markdown (Informal)
[LLaVA-RE: Binary Image-Text Relevancy Evaluation with Multimodal Large Language Model](https://aclanthology.org/2025.evalmg-1.4/) (Sun et al., EvalMG 2025)
ACL
Tao Sun, Oliver Liu, JinJin Li, and Lan Ma. 2025. LLaVA-RE: Binary Image-Text Relevancy Evaluation with Multimodal Large Language Model. In Proceedings of the First Workshop of Evaluation of Multi-Modal Generation, pages 40–51, Abu Dhabi, UAE. Association for Computational Linguistics.