@inproceedings{dubossarsky-dairkee-2024-strengthening,
title = "Strengthening the {W}i{C}: New Polysemy Dataset in {H}indi and Lack of Cross Lingual Transfer",
author = "Dubossarsky, Haim and
Dairkee, Farheen",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.1332",
pages = "15341--15349",
abstract = "This study addresses the critical issue of Natural Language Processing in low-resource languages such as Hindi, which, despite having substantial number of speakers, is limited in linguistic resources. The paper focuses on Word Sense Disambiguation, a fundamental NLP task that deals with polysemous words. It introduces a novel Hindi WSD dataset in the modern WiC format, enabling the training and testing of contextualized models. The primary contributions of this work lie in testing the efficacy of multilingual models to transfer across languages and hence to handle polysemy in low-resource languages, and in providing insights into the minimum training data required for a viable solution. Experiments compare different contextualized models on the WiC task via transfer learning from English to Hindi. Models purely transferred from English yield poor 55{\%} accuracy, while fine-tuning on Hindi dramatically improves performance to 90{\%} accuracy. This demonstrates the need for language-specific tuning and resources like the introduced Hindi WiC dataset to drive advances in Hindi NLP. The findings offer valuable insights into addressing the NLP needs of widely spoken yet low-resourced languages, shedding light on the problem of transfer learning in these contexts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dubossarsky-dairkee-2024-strengthening">
<titleInfo>
<title>Strengthening the WiC: New Polysemy Dataset in Hindi and Lack of Cross Lingual Transfer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Haim</namePart>
<namePart type="family">Dubossarsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Farheen</namePart>
<namePart type="family">Dairkee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study addresses the critical issue of Natural Language Processing in low-resource languages such as Hindi, which, despite having substantial number of speakers, is limited in linguistic resources. The paper focuses on Word Sense Disambiguation, a fundamental NLP task that deals with polysemous words. It introduces a novel Hindi WSD dataset in the modern WiC format, enabling the training and testing of contextualized models. The primary contributions of this work lie in testing the efficacy of multilingual models to transfer across languages and hence to handle polysemy in low-resource languages, and in providing insights into the minimum training data required for a viable solution. Experiments compare different contextualized models on the WiC task via transfer learning from English to Hindi. Models purely transferred from English yield poor 55% accuracy, while fine-tuning on Hindi dramatically improves performance to 90% accuracy. This demonstrates the need for language-specific tuning and resources like the introduced Hindi WiC dataset to drive advances in Hindi NLP. The findings offer valuable insights into addressing the NLP needs of widely spoken yet low-resourced languages, shedding light on the problem of transfer learning in these contexts.</abstract>
<identifier type="citekey">dubossarsky-dairkee-2024-strengthening</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.1332</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>15341</start>
<end>15349</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Strengthening the WiC: New Polysemy Dataset in Hindi and Lack of Cross Lingual Transfer
%A Dubossarsky, Haim
%A Dairkee, Farheen
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F dubossarsky-dairkee-2024-strengthening
%X This study addresses the critical issue of Natural Language Processing in low-resource languages such as Hindi, which, despite having substantial number of speakers, is limited in linguistic resources. The paper focuses on Word Sense Disambiguation, a fundamental NLP task that deals with polysemous words. It introduces a novel Hindi WSD dataset in the modern WiC format, enabling the training and testing of contextualized models. The primary contributions of this work lie in testing the efficacy of multilingual models to transfer across languages and hence to handle polysemy in low-resource languages, and in providing insights into the minimum training data required for a viable solution. Experiments compare different contextualized models on the WiC task via transfer learning from English to Hindi. Models purely transferred from English yield poor 55% accuracy, while fine-tuning on Hindi dramatically improves performance to 90% accuracy. This demonstrates the need for language-specific tuning and resources like the introduced Hindi WiC dataset to drive advances in Hindi NLP. The findings offer valuable insights into addressing the NLP needs of widely spoken yet low-resourced languages, shedding light on the problem of transfer learning in these contexts.
%U https://aclanthology.org/2024.lrec-main.1332
%P 15341-15349
Markdown (Informal)
[Strengthening the WiC: New Polysemy Dataset in Hindi and Lack of Cross Lingual Transfer](https://aclanthology.org/2024.lrec-main.1332) (Dubossarsky & Dairkee, LREC-COLING 2024)
ACL