@inproceedings{laba-hryniv-2026-sparse,
title = "From Sparse to Sense-Grounded: {W}ikipedia Training for {U}krainian Visual-{WSD}",
author = "Laba, Yurii and
Hryniv, Rostyslav O.",
editor = "Bonial, Claire and
Berzak, Yevgeni",
booktitle = "Proceedings of the 30th Conference on Computational Natural Language Learning",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.conll-main.29/",
pages = "501--514",
ISBN = "979-8-89176-410-1",
abstract = "Visual Word Sense Disambiguation (Visual-WSD) requires ranking the correct image for an ambiguous word given a short trigger phrase. For low-resource languages, it is bottle{\-}necked by scarce sense-level benchmarks and limited sense-aligned multimodal supervision. We study Ukrainian and (i) extend the Ukrainian Visual-WSD benchmark from 87 to 381 instances and benchmark multilingual CLIP checkpoints and multimodal large models, and (ii) introduce two scalable Wikipedia-derived dataset construction methods. Using compute-efficient adaptation we fine-tune a multilingual CLIP backbone and show that sense-grounded supervision drives the improvements: combining our two Wikipedia-derived datasets improves HIT@1 from 37.00{\%} to 43.05{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="laba-hryniv-2026-sparse">
<titleInfo>
<title>From Sparse to Sense-Grounded: Wikipedia Training for Ukrainian Visual-WSD</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yurii</namePart>
<namePart type="family">Laba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rostyslav</namePart>
<namePart type="given">O</namePart>
<namePart type="family">Hryniv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 30th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yevgeni</namePart>
<namePart type="family">Berzak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-410-1</identifier>
</relatedItem>
<abstract>Visual Word Sense Disambiguation (Visual-WSD) requires ranking the correct image for an ambiguous word given a short trigger phrase. For low-resource languages, it is bottlenecked by scarce sense-level benchmarks and limited sense-aligned multimodal supervision. We study Ukrainian and (i) extend the Ukrainian Visual-WSD benchmark from 87 to 381 instances and benchmark multilingual CLIP checkpoints and multimodal large models, and (ii) introduce two scalable Wikipedia-derived dataset construction methods. Using compute-efficient adaptation we fine-tune a multilingual CLIP backbone and show that sense-grounded supervision drives the improvements: combining our two Wikipedia-derived datasets improves HIT@1 from 37.00% to 43.05%.</abstract>
<identifier type="citekey">laba-hryniv-2026-sparse</identifier>
<location>
<url>https://aclanthology.org/2026.conll-main.29/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>501</start>
<end>514</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Sparse to Sense-Grounded: Wikipedia Training for Ukrainian Visual-WSD
%A Laba, Yurii
%A Hryniv, Rostyslav O.
%Y Bonial, Claire
%Y Berzak, Yevgeni
%S Proceedings of the 30th Conference on Computational Natural Language Learning
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-410-1
%F laba-hryniv-2026-sparse
%X Visual Word Sense Disambiguation (Visual-WSD) requires ranking the correct image for an ambiguous word given a short trigger phrase. For low-resource languages, it is bottlenecked by scarce sense-level benchmarks and limited sense-aligned multimodal supervision. We study Ukrainian and (i) extend the Ukrainian Visual-WSD benchmark from 87 to 381 instances and benchmark multilingual CLIP checkpoints and multimodal large models, and (ii) introduce two scalable Wikipedia-derived dataset construction methods. Using compute-efficient adaptation we fine-tune a multilingual CLIP backbone and show that sense-grounded supervision drives the improvements: combining our two Wikipedia-derived datasets improves HIT@1 from 37.00% to 43.05%.
%U https://aclanthology.org/2026.conll-main.29/
%P 501-514
Markdown (Informal)
[From Sparse to Sense-Grounded: Wikipedia Training for Ukrainian Visual-WSD](https://aclanthology.org/2026.conll-main.29/) (Laba & Hryniv, CoNLL 2026)
ACL