@inproceedings{miao-etal-2024-enhancing,
title = "Enhancing Cross-lingual Sentence Embedding for Low-resource Languages with Word Alignment",
author = "Miao, Zhongtao and
Wu, Qiyu and
Zhao, Kaiyan and
Wu, Zilong and
Tsuruoka, Yoshimasa",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.204",
doi = "10.18653/v1/2024.findings-naacl.204",
pages = "3225--3236",
abstract = "The field of cross-lingual sentence embeddings has recently experienced significant advancements, but research concerning low-resource languages has lagged due to the scarcity of parallel corpora. This paper shows that cross-lingual word representation in low-resource languages is notably under-aligned with that in high-resource languages in current models. To address this, we introduce a novel framework that explicitly aligns words between English and eight low-resource languages, utilizing off-the-shelf word alignment models. This framework incorporates three primary training objectives: aligned word prediction and word translation ranking, along with the widely used translation ranking. We evaluate our approach through experiments on the bitext retrieval task, which demonstrate substantial improvements on sentence embeddings in low-resource languages. In addition, the competitive performance of the proposed model across a broader range of tasks in high-resource languages underscores its practicality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="miao-etal-2024-enhancing">
<titleInfo>
<title>Enhancing Cross-lingual Sentence Embedding for Low-resource Languages with Word Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhongtao</namePart>
<namePart type="family">Miao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiyu</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaiyan</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zilong</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yoshimasa</namePart>
<namePart type="family">Tsuruoka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The field of cross-lingual sentence embeddings has recently experienced significant advancements, but research concerning low-resource languages has lagged due to the scarcity of parallel corpora. This paper shows that cross-lingual word representation in low-resource languages is notably under-aligned with that in high-resource languages in current models. To address this, we introduce a novel framework that explicitly aligns words between English and eight low-resource languages, utilizing off-the-shelf word alignment models. This framework incorporates three primary training objectives: aligned word prediction and word translation ranking, along with the widely used translation ranking. We evaluate our approach through experiments on the bitext retrieval task, which demonstrate substantial improvements on sentence embeddings in low-resource languages. In addition, the competitive performance of the proposed model across a broader range of tasks in high-resource languages underscores its practicality.</abstract>
<identifier type="citekey">miao-etal-2024-enhancing</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.204</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.204</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3225</start>
<end>3236</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Cross-lingual Sentence Embedding for Low-resource Languages with Word Alignment
%A Miao, Zhongtao
%A Wu, Qiyu
%A Zhao, Kaiyan
%A Wu, Zilong
%A Tsuruoka, Yoshimasa
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F miao-etal-2024-enhancing
%X The field of cross-lingual sentence embeddings has recently experienced significant advancements, but research concerning low-resource languages has lagged due to the scarcity of parallel corpora. This paper shows that cross-lingual word representation in low-resource languages is notably under-aligned with that in high-resource languages in current models. To address this, we introduce a novel framework that explicitly aligns words between English and eight low-resource languages, utilizing off-the-shelf word alignment models. This framework incorporates three primary training objectives: aligned word prediction and word translation ranking, along with the widely used translation ranking. We evaluate our approach through experiments on the bitext retrieval task, which demonstrate substantial improvements on sentence embeddings in low-resource languages. In addition, the competitive performance of the proposed model across a broader range of tasks in high-resource languages underscores its practicality.
%R 10.18653/v1/2024.findings-naacl.204
%U https://aclanthology.org/2024.findings-naacl.204
%U https://doi.org/10.18653/v1/2024.findings-naacl.204
%P 3225-3236
Markdown (Informal)
[Enhancing Cross-lingual Sentence Embedding for Low-resource Languages with Word Alignment](https://aclanthology.org/2024.findings-naacl.204) (Miao et al., Findings 2024)
ACL