@inproceedings{lim-liew-2022-english,
title = "{E}nglish-{M}alay Cross-Lingual Embedding Alignment using Bilingual Lexicon Augmentation",
author = "Lim, Ying Hao and
Liew, Jasy Suet Yan",
editor = "Louvan, Samuel and
Madotto, Andrea and
Madureira, Brielen",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.acl-srw.16/",
doi = "10.18653/v1/2022.acl-srw.16",
pages = "229--238",
abstract = "As high-quality Malay language resources are still a scarcity, cross lingual word embeddings make it possible for richer English resources to be leveraged for downstream Malay text classification tasks. This paper focuses on creating an English-Malay cross-lingual word embeddings using embedding alignment by exploiting existing language resources. We augmented the training bilingual lexicons using machine translation with the goal to improve the alignment precision of our cross-lingual word embeddings. We investigated the quality of the current state-of-the-art English-Malay bilingual lexicon and worked on improving its quality using Google Translate. We also examined the effect of Malay word coverage on the quality of cross-lingual word embeddings. Experimental results with a precision up till 28.17{\%} show that the alignment precision of the cross-lingual word embeddings would inevitably degrade after 1-NN but a better seed lexicon and cleaner nearest neighbours can reduce the number of word pairs required to achieve satisfactory performance. As the English and Malay monolingual embeddings are pre-trained on informal language corpora, our proposed English-Malay embeddings alignment approach is also able to map non-standard Malay translations in the English nearest neighbours."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lim-liew-2022-english">
<titleInfo>
<title>English-Malay Cross-Lingual Embedding Alignment using Bilingual Lexicon Augmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="given">Hao</namePart>
<namePart type="family">Lim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jasy</namePart>
<namePart type="given">Suet</namePart>
<namePart type="given">Yan</namePart>
<namePart type="family">Liew</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Louvan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Madotto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brielen</namePart>
<namePart type="family">Madureira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As high-quality Malay language resources are still a scarcity, cross lingual word embeddings make it possible for richer English resources to be leveraged for downstream Malay text classification tasks. This paper focuses on creating an English-Malay cross-lingual word embeddings using embedding alignment by exploiting existing language resources. We augmented the training bilingual lexicons using machine translation with the goal to improve the alignment precision of our cross-lingual word embeddings. We investigated the quality of the current state-of-the-art English-Malay bilingual lexicon and worked on improving its quality using Google Translate. We also examined the effect of Malay word coverage on the quality of cross-lingual word embeddings. Experimental results with a precision up till 28.17% show that the alignment precision of the cross-lingual word embeddings would inevitably degrade after 1-NN but a better seed lexicon and cleaner nearest neighbours can reduce the number of word pairs required to achieve satisfactory performance. As the English and Malay monolingual embeddings are pre-trained on informal language corpora, our proposed English-Malay embeddings alignment approach is also able to map non-standard Malay translations in the English nearest neighbours.</abstract>
<identifier type="citekey">lim-liew-2022-english</identifier>
<identifier type="doi">10.18653/v1/2022.acl-srw.16</identifier>
<location>
<url>https://aclanthology.org/2022.acl-srw.16/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>229</start>
<end>238</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T English-Malay Cross-Lingual Embedding Alignment using Bilingual Lexicon Augmentation
%A Lim, Ying Hao
%A Liew, Jasy Suet Yan
%Y Louvan, Samuel
%Y Madotto, Andrea
%Y Madureira, Brielen
%S Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F lim-liew-2022-english
%X As high-quality Malay language resources are still a scarcity, cross lingual word embeddings make it possible for richer English resources to be leveraged for downstream Malay text classification tasks. This paper focuses on creating an English-Malay cross-lingual word embeddings using embedding alignment by exploiting existing language resources. We augmented the training bilingual lexicons using machine translation with the goal to improve the alignment precision of our cross-lingual word embeddings. We investigated the quality of the current state-of-the-art English-Malay bilingual lexicon and worked on improving its quality using Google Translate. We also examined the effect of Malay word coverage on the quality of cross-lingual word embeddings. Experimental results with a precision up till 28.17% show that the alignment precision of the cross-lingual word embeddings would inevitably degrade after 1-NN but a better seed lexicon and cleaner nearest neighbours can reduce the number of word pairs required to achieve satisfactory performance. As the English and Malay monolingual embeddings are pre-trained on informal language corpora, our proposed English-Malay embeddings alignment approach is also able to map non-standard Malay translations in the English nearest neighbours.
%R 10.18653/v1/2022.acl-srw.16
%U https://aclanthology.org/2022.acl-srw.16/
%U https://doi.org/10.18653/v1/2022.acl-srw.16
%P 229-238
Markdown (Informal)
[English-Malay Cross-Lingual Embedding Alignment using Bilingual Lexicon Augmentation](https://aclanthology.org/2022.acl-srw.16/) (Lim & Liew, ACL 2022)
ACL