% Conference paper record (inproceedings) for citation key paniv-etal-2025-benchmarking.
% Words that must keep their capitalisation under sentence-casing styles are
% protected as whole words; values are brace-delimited so they can safely nest.
@inproceedings{paniv-etal-2025-benchmarking,
  title     = {Benchmarking Multimodal Models for {Ukrainian} Language Understanding Across Academic and Cultural Domains},
  author    = {Paniv, Yurii and
               Kiulian, Artur and
               Chaplynskyi, Dmytro and
               Khandoga, Mykola and
               Polishko, Anton and
               Bas, Tetiana and
               Gabrielli, Guillermo},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fourth {Ukrainian} Natural Language Processing Workshop ({UNLP} 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria (online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.unlp-1.2/},
  doi       = {10.18653/v1/2025.unlp-1.2},
  pages     = {14--26},
  isbn      = {979-8-89176-269-5},
  abstract  = {While the evaluation of multimodal English-centric models is an active area of research with numerous benchmarks, there is a profound lack of benchmarks or evaluation suites for low- and mid-resource languages. We introduce ZNO-Vision, a comprehensive multimodal Ukrainian-centric benchmark derived from the standardized university entrance examination (ZNO). The benchmark consists of over 4300 expert-crafted questions spanning 12 academic disciplines, including mathematics, physics, chemistry, and humanities. We evaluated the performance of both open-source models and API providers, finding that only a handful of models performed above baseline. Alongside the new benchmark, we performed the first evaluation study of multimodal text generation for the Ukrainian language: we measured caption generation quality on the Multi30K-UK dataset. Lastly, we tested a few models from a cultural perspective on knowledge of national cuisine. We believe our work will advance multimodal generation capabilities for the Ukrainian language and our approach could be useful for other low-resource languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="paniv-etal-2025-benchmarking">
    <titleInfo>
      <title>Benchmarking Multimodal Models for Ukrainian Language Understanding Across Academic and Cultural Domains</title>
    </titleInfo>

    <!-- Authors: one personal name element per contributor with the author role. -->
    <name type="personal">
      <namePart type="given">Yurii</namePart>
      <namePart type="family">Paniv</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Artur</namePart>
      <namePart type="family">Kiulian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dmytro</namePart>
      <namePart type="family">Chaplynskyi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mykola</namePart>
      <namePart type="family">Khandoga</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anton</namePart>
      <namePart type="family">Polishko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tetiana</namePart>
      <namePart type="family">Bas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guillermo</namePart>
      <namePart type="family">Gabrielli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume that contains this paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mariana</namePart>
        <namePart type="family">Romanyshyn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria (online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-269-5</identifier>
    </relatedItem>

    <abstract>While the evaluation of multimodal English-centric models is an active area of research with numerous benchmarks, there is a profound lack of benchmarks or evaluation suites for low- and mid-resource languages. We introduce ZNO-Vision, a comprehensive multimodal Ukrainian-centric benchmark derived from the standardized university entrance examination (ZNO). The benchmark consists of over 4300 expert-crafted questions spanning 12 academic disciplines, including mathematics, physics, chemistry, and humanities. We evaluated the performance of both open-source models and API providers, finding that only a handful of models performed above baseline. Alongside the new benchmark, we performed the first evaluation study of multimodal text generation for the Ukrainian language: we measured caption generation quality on the Multi30K-UK dataset. Lastly, we tested a few models from a cultural perspective on knowledge of national cuisine. We believe our work will advance multimodal generation capabilities for the Ukrainian language and our approach could be useful for other low-resource languages.</abstract>

    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">paniv-etal-2025-benchmarking</identifier>
    <identifier type="doi">10.18653/v1/2025.unlp-1.2</identifier>
    <location>
      <url>https://aclanthology.org/2025.unlp-1.2/</url>
    </location>

    <!-- Position of the paper within the host volume (page range). -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>14</start>
        <end>26</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Benchmarking Multimodal Models for Ukrainian Language Understanding Across Academic and Cultural Domains
%A Paniv, Yurii
%A Kiulian, Artur
%A Chaplynskyi, Dmytro
%A Khandoga, Mykola
%A Polishko, Anton
%A Bas, Tetiana
%A Gabrielli, Guillermo
%Y Romanyshyn, Mariana
%S Proceedings of the Fourth Ukrainian Natural Language Processing Workshop (UNLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria (online)
%@ 979-8-89176-269-5
%F paniv-etal-2025-benchmarking
%X While the evaluation of multimodal English-centric models is an active area of research with numerous benchmarks, there is a profound lack of benchmarks or evaluation suites for low- and mid-resource languages. We introduce ZNO-Vision, a comprehensive multimodal Ukrainian-centric benchmark derived from the standardized university entrance examination (ZNO). The benchmark consists of over 4300 expert-crafted questions spanning 12 academic disciplines, including mathematics, physics, chemistry, and humanities. We evaluated the performance of both open-source models and API providers, finding that only a handful of models performed above baseline. Alongside the new benchmark, we performed the first evaluation study of multimodal text generation for the Ukrainian language: we measured caption generation quality on the Multi30K-UK dataset. Lastly, we tested a few models from a cultural perspective on knowledge of national cuisine. We believe our work will advance multimodal generation capabilities for the Ukrainian language and our approach could be useful for other low-resource languages.
%R 10.18653/v1/2025.unlp-1.2
%U https://aclanthology.org/2025.unlp-1.2/
%U https://doi.org/10.18653/v1/2025.unlp-1.2
%P 14-26
Markdown (Informal)
[Benchmarking Multimodal Models for Ukrainian Language Understanding Across Academic and Cultural Domains](https://aclanthology.org/2025.unlp-1.2/) (Paniv et al., UNLP 2025)
ACL