% Conference paper in the SIGTYP 2025 workshop proceedings (ACL Anthology id 2025.sigtyp-1.7).
% Words with internal capitals in the title are brace-protected as whole words
% so that sentence-casing bibliography styles leave them intact.
% NOTE(review): "address" holds the location given by the source record; confirm
% whether the target style expects the publisher city there instead.
@inproceedings{goworek-etal-2025-senwich,
  title     = {{SenWiCh}: Sense-Annotation of Low-Resource Languages for {WiC} using Hybrid Methods},
  author    = {Goworek, Roksana and
               Karlcut, Harpal Singh and
               Shezad, Hamza and
               Darshana, Nijaguna and
               Mane, Abhishek and
               Bondada, Syam and
               Sikka, Raghav and
               Mammadov, Ulvi and
               Allahverdiyev, Rauf and
               Purighella, Sriram Satkirti and
               Gupta, Paridhi and
               Ndegwa, Muhinyia and
               Tran, Bao Khanh and
               Dubossarsky, Haim},
  editor    = {Hahn, Michael and
               Rani, Priya and
               Kumar, Ritesh and
               Shcherbakov, Andreas and
               Sorokin, Alexey and
               Serikov, Oleg and
               Cotterell, Ryan and
               Vylomova, Ekaterina},
  booktitle = {Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
  month     = aug,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.sigtyp-1.7/},
  doi       = {10.18653/v1/2025.sigtyp-1.7},
  pages     = {61--74},
  isbn      = {979-8-89176-281-7},
  abstract  = {This paper addresses the critical need for high-quality evaluation datasets in low-resource languages to advance cross-lingual transfer. While cross-lingual transfer offers a key strategy for leveraging multilingual pretraining to expand language technologies to understudied and typologically diverse languages, its effectiveness is dependent on quality and suitable benchmarks. We release new sense-annotated datasets of sentences containing polysemous words, spanning nine low-resource languages across diverse language families and scripts. To facilitate dataset creation, the paper presents a demonstrably beneficial semi-automatic annotation method. The utility of the datasets is demonstrated through Word-in-Context (WiC) formatted experiments that evaluate transfer on these low-resource languages. Results highlight the importance of targeted dataset creation and evaluation for effective polysemy disambiguation in low-resource settings and transfer studies. The released datasets and code aim to support further research into fair, robust, and truly multilingual NLP.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey goworek-etal-2025-senwich, together with its host proceedings volume. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="goworek-etal-2025-senwich">
<titleInfo>
<title>SenWiCh: Sense-Annotation of Low-Resource Languages for WiC using Hybrid Methods</title>
</titleInfo>
<!-- Authors of the paper: one <name> element per person, each carrying the role term "author". -->
<name type="personal">
<namePart type="given">Roksana</namePart>
<namePart type="family">Goworek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harpal</namePart>
<namePart type="given">Singh</namePart>
<namePart type="family">Karlcut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamza</namePart>
<namePart type="family">Shezad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nijaguna</namePart>
<namePart type="family">Darshana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abhishek</namePart>
<namePart type="family">Mane</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syam</namePart>
<namePart type="family">Bondada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raghav</namePart>
<namePart type="family">Sikka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ulvi</namePart>
<namePart type="family">Mammadov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rauf</namePart>
<namePart type="family">Allahverdiyev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sriram</namePart>
<namePart type="given">Satkirti</namePart>
<namePart type="family">Purighella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paridhi</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhinyia</namePart>
<namePart type="family">Ndegwa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bao</namePart>
<namePart type="given">Khanh</namePart>
<namePart type="family">Tran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haim</namePart>
<namePart type="family">Dubossarsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year and month. -->
<originInfo>
<dateIssued>2025-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing the paper, with its editors, publisher, place, genre and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Hahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Priya</namePart>
<namePart type="family">Rani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ritesh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Shcherbakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexey</namePart>
<namePart type="family">Sorokin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-281-7</identifier>
</relatedItem>
<abstract>This paper addresses the critical need for high-quality evaluation datasets in low-resource languages to advance cross-lingual transfer. While cross-lingual transfer offers a key strategy for leveraging multilingual pretraining to expand language technologies to understudied and typologically diverse languages, its effectiveness is dependent on quality and suitable benchmarks. We release new sense-annotated datasets of sentences containing polysemous words, spanning nine low-resource languages across diverse language families and scripts. To facilitate dataset creation, the paper presents a demonstrably beneficial semi-automatic annotation method. The utility of the datasets is demonstrated through Word-in-Context (WiC) formatted experiments that evaluate transfer on these low-resource languages. Results highlight the importance of targeted dataset creation and evaluation for effective polysemy disambiguation in low-resource settings and transfer studies. The released datasets and code aim to support further research into fair, robust, and truly multilingual NLP.</abstract>
<!-- Identifiers and URL of the paper itself: citation key, DOI, and the ACL Anthology page. -->
<identifier type="citekey">goworek-etal-2025-senwich</identifier>
<identifier type="doi">10.18653/v1/2025.sigtyp-1.7</identifier>
<location>
<url>https://aclanthology.org/2025.sigtyp-1.7/</url>
</location>
<!-- Position within the host volume: date plus the page extent, pages 61 to 74. -->
<part>
<date>2025-08</date>
<extent unit="page">
<start>61</start>
<end>74</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SenWiCh: Sense-Annotation of Low-Resource Languages for WiC using Hybrid Methods
%A Goworek, Roksana
%A Karlcut, Harpal Singh
%A Shezad, Hamza
%A Darshana, Nijaguna
%A Mane, Abhishek
%A Bondada, Syam
%A Sikka, Raghav
%A Mammadov, Ulvi
%A Allahverdiyev, Rauf
%A Purighella, Sriram Satkirti
%A Gupta, Paridhi
%A Ndegwa, Muhinyia
%A Tran, Bao Khanh
%A Dubossarsky, Haim
%Y Hahn, Michael
%Y Rani, Priya
%Y Kumar, Ritesh
%Y Shcherbakov, Andreas
%Y Sorokin, Alexey
%Y Serikov, Oleg
%Y Cotterell, Ryan
%Y Vylomova, Ekaterina
%S Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP
%D 2025
%8 August
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-281-7
%F goworek-etal-2025-senwich
%X This paper addresses the critical need for high-quality evaluation datasets in low-resource languages to advance cross-lingual transfer. While cross-lingual transfer offers a key strategy for leveraging multilingual pretraining to expand language technologies to understudied and typologically diverse languages, its effectiveness is dependent on quality and suitable benchmarks. We release new sense-annotated datasets of sentences containing polysemous words, spanning nine low-resource languages across diverse language families and scripts. To facilitate dataset creation, the paper presents a demonstrably beneficial semi-automatic annotation method. The utility of the datasets is demonstrated through Word-in-Context (WiC) formatted experiments that evaluate transfer on these low-resource languages. Results highlight the importance of targeted dataset creation and evaluation for effective polysemy disambiguation in low-resource settings and transfer studies. The released datasets and code aim to support further research into fair, robust, and truly multilingual NLP.
%R 10.18653/v1/2025.sigtyp-1.7
%U https://aclanthology.org/2025.sigtyp-1.7/
%U https://doi.org/10.18653/v1/2025.sigtyp-1.7
%P 61-74
Markdown (Informal)
[SenWiCh: Sense-Annotation of Low-Resource Languages for WiC using Hybrid Methods](https://aclanthology.org/2025.sigtyp-1.7/) (Goworek et al., SIGTYP 2025)
ACL
- Roksana Goworek, Harpal Singh Karlcut, Hamza Shezad, Nijaguna Darshana, Abhishek Mane, Syam Bondada, Raghav Sikka, Ulvi Mammadov, Rauf Allahverdiyev, Sriram Satkirti Purighella, Paridhi Gupta, Muhinyia Ndegwa, Bao Khanh Tran, and Haim Dubossarsky. 2025. SenWiCh: Sense-Annotation of Low-Resource Languages for WiC using Hybrid Methods. In Proceedings of the 7th Workshop on Research in Computational Linguistic Typology and Multilingual NLP, pages 61–74, Vienna, Austria. Association for Computational Linguistics.