@inproceedings{bear-cook-2023-fine,
title = "Fine-tuning Sentence-{R}o{BERT}a to Construct Word Embeddings for Low-resource Languages from Bilingual Dictionaries",
author = "Bear, Diego and
Cook, Paul",
editor = "Mager, Manuel and
Ebrahimi, Abteen and
Oncevay, Arturo and
Rice, Enora and
Rijhwani, Shruti and
Palmer, Alexis and
Kann, Katharina",
booktitle = "Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.americasnlp-1.7",
doi = "10.18653/v1/2023.americasnlp-1.7",
pages = "47--57",
abstract = "Conventional approaches to learning word embeddings (Mikolov et al., 2013; Pennington et al., 2014) are limited to relatively few languages with sufficiently large training corpora. To address this limitation, we propose an alternative approach to deriving word embeddings for Wolastoqey and Mi{'}kmaq that leverages definitions from a bilingual dictionary. More specifically, following Bear and Cook (2022), we experiment with encoding English definitions of Wolastoqey and Mi{'}kmaq words into vector representations using English sequence representation models. For this, we consider using and finetuning sentence-RoBERTa models (Reimers and Gurevych, 2019). We evaluate our word embeddings using a similar methodology to that of Bear and Cook using evaluations based on word classification, clustering and reverse dictionary search. We additionally construct word embeddings for higher-resource languages English, German and Spanishusing our methods and evaluate our embeddings on existing word-similarity datasets. Our findings indicate that our word embedding methods can be used to produce meaningful vector representations for low-resource languages such as Wolastoqey and Mi{'}kmaq and for higher-resource languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bear-cook-2023-fine">
<titleInfo>
<title>Fine-tuning Sentence-RoBERTa to Construct Word Embeddings for Low-resource Languages from Bilingual Dictionaries</title>
</titleInfo>
<name type="personal">
<namePart type="given">Diego</namePart>
<namePart type="family">Bear</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Cook</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Mager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enora</namePart>
<namePart type="family">Rice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Kann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Conventional approaches to learning word embeddings (Mikolov et al., 2013; Pennington et al., 2014) are limited to relatively few languages with sufficiently large training corpora. To address this limitation, we propose an alternative approach to deriving word embeddings for Wolastoqey and Mi’kmaq that leverages definitions from a bilingual dictionary. More specifically, following Bear and Cook (2022), we experiment with encoding English definitions of Wolastoqey and Mi’kmaq words into vector representations using English sequence representation models. For this, we consider using and finetuning sentence-RoBERTa models (Reimers and Gurevych, 2019). We evaluate our word embeddings using a similar methodology to that of Bear and Cook using evaluations based on word classification, clustering and reverse dictionary search. We additionally construct word embeddings for higher-resource languages English, German and Spanish using our methods and evaluate our embeddings on existing word-similarity datasets. Our findings indicate that our word embedding methods can be used to produce meaningful vector representations for low-resource languages such as Wolastoqey and Mi’kmaq and for higher-resource languages.</abstract>
<identifier type="citekey">bear-cook-2023-fine</identifier>
<identifier type="doi">10.18653/v1/2023.americasnlp-1.7</identifier>
<location>
<url>https://aclanthology.org/2023.americasnlp-1.7</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>47</start>
<end>57</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fine-tuning Sentence-RoBERTa to Construct Word Embeddings for Low-resource Languages from Bilingual Dictionaries
%A Bear, Diego
%A Cook, Paul
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Oncevay, Arturo
%Y Rice, Enora
%Y Rijhwani, Shruti
%Y Palmer, Alexis
%Y Kann, Katharina
%S Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F bear-cook-2023-fine
%X Conventional approaches to learning word embeddings (Mikolov et al., 2013; Pennington et al., 2014) are limited to relatively few languages with sufficiently large training corpora. To address this limitation, we propose an alternative approach to deriving word embeddings for Wolastoqey and Mi’kmaq that leverages definitions from a bilingual dictionary. More specifically, following Bear and Cook (2022), we experiment with encoding English definitions of Wolastoqey and Mi’kmaq words into vector representations using English sequence representation models. For this, we consider using and finetuning sentence-RoBERTa models (Reimers and Gurevych, 2019). We evaluate our word embeddings using a similar methodology to that of Bear and Cook using evaluations based on word classification, clustering and reverse dictionary search. We additionally construct word embeddings for higher-resource languages English, German and Spanish using our methods and evaluate our embeddings on existing word-similarity datasets. Our findings indicate that our word embedding methods can be used to produce meaningful vector representations for low-resource languages such as Wolastoqey and Mi’kmaq and for higher-resource languages.
%R 10.18653/v1/2023.americasnlp-1.7
%U https://aclanthology.org/2023.americasnlp-1.7
%U https://doi.org/10.18653/v1/2023.americasnlp-1.7
%P 47-57
Markdown (Informal)
[Fine-tuning Sentence-RoBERTa to Construct Word Embeddings for Low-resource Languages from Bilingual Dictionaries](https://aclanthology.org/2023.americasnlp-1.7) (Bear & Cook, AmericasNLP 2023)
ACL
Diego Bear and Paul Cook. 2023. [Fine-tuning Sentence-RoBERTa to Construct Word Embeddings for Low-resource Languages from Bilingual Dictionaries](https://aclanthology.org/2023.americasnlp-1.7). In *Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)*, pages 47–57, Toronto, Canada. Association for Computational Linguistics.
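The abstract describes deriving word embeddings for Wolastoqey and Mi’kmaq by encoding each headword's English dictionary definition with a sentence-RoBERTa model (Reimers and Gurevych, 2019). A minimal sketch of that idea using the sentence-transformers library follows; the checkpoint name and the dictionary entries are illustrative assumptions, not a reproduction of the paper's setup.

```python
# Sketch of the definition-embedding idea from the abstract: treat the
# encoding of a headword's English dictionary definition as that
# headword's embedding. Model choice and entries are assumptions.
from sentence_transformers import SentenceTransformer

# Any pretrained sentence-RoBERTa checkpoint could stand in here;
# "all-roberta-large-v1" is one publicly available option.
model = SentenceTransformer("all-roberta-large-v1")

# Hypothetical bilingual-dictionary entries: headword -> English definition.
dictionary = {
    "headword_1": "a small animal that lives near rivers",
    "headword_2": "to walk slowly through deep snow",
}

# Encode all definitions in one batch; each row is one headword's vector.
vectors = model.encode(list(dictionary.values()))
embeddings = dict(zip(dictionary, vectors))
print(embeddings["headword_1"].shape)  # (1024,) for a roberta-large model
```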