% Conference-paper record riley-etal-2025-semi (11 authors, 6 proceedings editors).
% All names use the unambiguous "Last, First" form. The editor surname
% "Di Eugenio" is a two-word family name, so both words stay before the comma;
% writing "Eugenio, Barbara Di" would make "Di" part of the given name.
@inproceedings{riley-etal-2025-semi,
    title = "Semi-Automated Construction of Sense-Annotated Datasets for Practically Any Language",
    author = "Riley, Jai and
      Hauer, Bradley M. and
      Hriti, Nafisa Sadaf and
      Luo, Guoqing and
      Mirzaei, Amir Reza and
      Rafiei, Ali and
      Sheikhi, Hadi and
      Siavashpour, Mahvash and
      Tavakoli, Mohammad and
      Shi, Ning and
      Kondrak, Grzegorz",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.419/",
    pages = "6270--6284",
    abstract = "High-quality sense-annotated datasets are vital for evaluating and comparing WSD systems. We present a novel approach to creating parallel sense-annotated datasets, which can be applied to any language that English can be translated into. The method incorporates machine translation, word alignment, sense projection, and sense filtering to produce silver annotations, which can then be revised manually to obtain gold datasets. By applying our method to Farsi, Chinese, and Bengali, we produce new parallel benchmark datasets, which are vetted by native speakers of each language. Our automatically-generated silver datasets are of higher quality than the annotations obtained with recent multilingual WSD systems, particularly on non-European languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the conference paper riley-etal-2025-semi.
       Top-level <name> elements are the paper's authors; the <name> elements
       inside <relatedItem type="host"> are the editors of the proceedings. -->
  <mods ID="riley-etal-2025-semi">
    <titleInfo>
      <title>Semi-Automated Construction of Sense-Annotated Datasets for Practically Any Language</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Jai</namePart>
      <namePart type="family">Riley</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bradley</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Hauer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nafisa</namePart>
      <namePart type="given">Sadaf</namePart>
      <namePart type="family">Hriti</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guoqing</namePart>
      <namePart type="family">Luo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Amir</namePart>
      <namePart type="given">Reza</namePart>
      <namePart type="family">Mirzaei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ali</namePart>
      <namePart type="family">Rafiei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hadi</namePart>
      <namePart type="family">Sheikhi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mahvash</namePart>
      <namePart type="family">Siavashpour</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohammad</namePart>
      <namePart type="family">Tavakoli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ning</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Grzegorz</namePart>
      <namePart type="family">Kondrak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume containing the paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <!-- "Di Eugenio" is a compound family name: the particle "Di"
             belongs in the family part, not as a second given name. -->
        <namePart type="given">Barbara</namePart>
        <namePart type="family">Di Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>High-quality sense-annotated datasets are vital for evaluating and comparing WSD systems. We present a novel approach to creating parallel sense-annotated datasets, which can be applied to any language that English can be translated into. The method incorporates machine translation, word alignment, sense projection, and sense filtering to produce silver annotations, which can then be revised manually to obtain gold datasets. By applying our method to Farsi, Chinese, and Bengali, we produce new parallel benchmark datasets, which are vetted by native speakers of each language. Our automatically-generated silver datasets are of higher quality than the annotations obtained with recent multilingual WSD systems, particularly on non-European languages.</abstract>
    <identifier type="citekey">riley-etal-2025-semi</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.419/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>6270</start>
        <end>6284</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Semi-Automated Construction of Sense-Annotated Datasets for Practically Any Language
%A Riley, Jai
%A Hauer, Bradley M.
%A Hriti, Nafisa Sadaf
%A Luo, Guoqing
%A Mirzaei, Amir Reza
%A Rafiei, Ali
%A Sheikhi, Hadi
%A Siavashpour, Mahvash
%A Tavakoli, Mohammad
%A Shi, Ning
%A Kondrak, Grzegorz
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F riley-etal-2025-semi
%X High-quality sense-annotated datasets are vital for evaluating and comparing WSD systems. We present a novel approach to creating parallel sense-annotated datasets, which can be applied to any language that English can be translated into. The method incorporates machine translation, word alignment, sense projection, and sense filtering to produce silver annotations, which can then be revised manually to obtain gold datasets. By applying our method to Farsi, Chinese, and Bengali, we produce new parallel benchmark datasets, which are vetted by native speakers of each language. Our automatically-generated silver datasets are of higher quality than the annotations obtained with recent multilingual WSD systems, particularly on non-European languages.
%U https://aclanthology.org/2025.coling-main.419/
%P 6270-6284
Markdown (Informal)
[Semi-Automated Construction of Sense-Annotated Datasets for Practically Any Language](https://aclanthology.org/2025.coling-main.419/) (Riley et al., COLING 2025)
ACL
- Jai Riley, Bradley M. Hauer, Nafisa Sadaf Hriti, Guoqing Luo, Amir Reza Mirzaei, Ali Rafiei, Hadi Sheikhi, Mahvash Siavashpour, Mohammad Tavakoli, Ning Shi, and Grzegorz Kondrak. 2025. Semi-Automated Construction of Sense-Annotated Datasets for Practically Any Language. In Proceedings of the 31st International Conference on Computational Linguistics, pages 6270–6284, Abu Dhabi, UAE. Association for Computational Linguistics.