@inproceedings{tashu-tudor-2025-mapping,
title = "Mapping Cross-Lingual Sentence Representations for Low-Resource Language Pairs Using Pre-trained Language Models",
author = "Tashu, Tsegaye Misikir and
Tudor, Andreea Ioana",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the First Workshop on Language Models for Low-Resource Languages",
month = jan,
year = "2025",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.loreslm-1.20/",
pages = "249--257",
abstract = "In this work, we explore different linear mapping techniques to learn cross-lingual document representations from pre-trained multilingual large language models for low-resource languages. Three different mapping techniques namely Linear Concept Approximation (LCA), Linear Concept Compression (LCC), and Neural Concept Approximation (NCA) and four multilingual language models such as mBERT, mT5, XLM-R, and ErnieM were used to extract embeddings. The inter-lingual representations were created mappings the monolingual representation extracted from multilingual language models. The experimental results showed that LCA and LCC significantly outperform NCA, with models like ErnieM achieving the highest alignment quality. Language pairs exhibit variable performance, influenced by linguistic similarity and data availability, with the Amharic-English pair yielding particularly high scores. The results showed the utility of LCA and LCC in enabling cross-lingual tasks for low-resource languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tashu-tudor-2025-mapping">
<titleInfo>
<title>Mapping Cross-Lingual Sentence Representations for Low-Resource Language Pairs Using Pre-trained Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tsegaye</namePart>
<namePart type="given">Misikir</namePart>
<namePart type="family">Tashu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreea</namePart>
<namePart type="given">Ioana</namePart>
<namePart type="family">Tudor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Language Models for Low-Resource Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this work, we explore different linear mapping techniques to learn cross-lingual document representations from pre-trained multilingual large language models for low-resource languages. Three mapping techniques, namely Linear Concept Approximation (LCA), Linear Concept Compression (LCC), and Neural Concept Approximation (NCA), were applied to embeddings extracted from four multilingual language models: mBERT, mT5, XLM-R, and ErnieM. The inter-lingual representations were created by mapping the monolingual representations extracted from these multilingual language models. The experimental results showed that LCA and LCC significantly outperform NCA, with ErnieM achieving the highest alignment quality. Performance varied across language pairs, influenced by linguistic similarity and data availability, with the Amharic-English pair yielding particularly high scores. These results demonstrate the utility of LCA and LCC in enabling cross-lingual tasks for low-resource languages.</abstract>
<identifier type="citekey">tashu-tudor-2025-mapping</identifier>
<location>
<url>https://aclanthology.org/2025.loreslm-1.20/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>249</start>
<end>257</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mapping Cross-Lingual Sentence Representations for Low-Resource Language Pairs Using Pre-trained Language Models
%A Tashu, Tsegaye Misikir
%A Tudor, Andreea Ioana
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the First Workshop on Language Models for Low-Resource Languages
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F tashu-tudor-2025-mapping
%X In this work, we explore different linear mapping techniques to learn cross-lingual document representations from pre-trained multilingual large language models for low-resource languages. Three mapping techniques, namely Linear Concept Approximation (LCA), Linear Concept Compression (LCC), and Neural Concept Approximation (NCA), were applied to embeddings extracted from four multilingual language models: mBERT, mT5, XLM-R, and ErnieM. The inter-lingual representations were created by mapping the monolingual representations extracted from these multilingual language models. The experimental results showed that LCA and LCC significantly outperform NCA, with ErnieM achieving the highest alignment quality. Performance varied across language pairs, influenced by linguistic similarity and data availability, with the Amharic-English pair yielding particularly high scores. These results demonstrate the utility of LCA and LCC in enabling cross-lingual tasks for low-resource languages.
%U https://aclanthology.org/2025.loreslm-1.20/
%P 249-257
Markdown (Informal)
[Mapping Cross-Lingual Sentence Representations for Low-Resource Language Pairs Using Pre-trained Language Models](https://aclanthology.org/2025.loreslm-1.20/) (Tashu & Tudor, LoResLM 2025)
ACL
Tsegaye Misikir Tashu and Andreea Ioana Tudor. 2025. [Mapping Cross-Lingual Sentence Representations for Low-Resource Language Pairs Using Pre-trained Language Models](https://aclanthology.org/2025.loreslm-1.20/). In *Proceedings of the First Workshop on Language Models for Low-Resource Languages*, pages 249–257, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
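
As a rough illustration of the kind of linear mapping the abstract describes, the sketch below fits a least-squares linear map between row-aligned source- and target-language sentence embeddings and uses it for cross-lingual retrieval. This is a generic baseline, not the paper's LCA, LCC, or NCA implementations; the array shapes, the random placeholder data, and the `cosine_sim` helper are assumptions made for the example.

```python
import numpy as np

# Toy stand-in data: row-aligned sentence embeddings for a parallel corpus,
# as might be extracted from a multilingual encoder such as mBERT or XLM-R.
# Shapes and random values here are placeholders, not the paper's setup.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 768))  # source-language embeddings (e.g. Amharic)
Y = rng.normal(size=(1000, 768))  # target-language embeddings (e.g. English)

# Fit a linear map W that minimizes ||X @ W - Y||_F via least squares.
W, *_ = np.linalg.lstsq(X, Y, rcond=None)

def cosine_sim(query, matrix):
    """Cosine similarity between one vector and each row of a matrix."""
    query = query / np.linalg.norm(query)
    matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix @ query

# Project a source embedding into the target space, then retrieve the
# most similar target-language sentence (mate retrieval).
projected = X[0] @ W
best_match = int(np.argmax(cosine_sim(projected, Y)))
print(best_match)
```

With real parallel embeddings, retrieval accuracy of `best_match` against the known alignment is one way to measure mapping quality, which is in the spirit of the alignment evaluation the abstract reports.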