@inproceedings{tariq-etal-2025-building,
title = "Building a Clean Bartangi Language Corpus and Training Word Embeddings for Low-Resource Language Modeling",
author = "Tariq, Warda and
Popov, Victor and
Gromov, Vasilii",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.145/",
pages = "1256--1262",
abstract = "In this paper, we showcase a comprehensive end-to-end pipeline for creating a superior Bartangi language corpus and using it for training word embeddings. The critically low-resource Pamiri lan- guage of Bartangi, which is spoken in Tajikistan, has difficulties such as morphological complexity, orthographic variety, and a lack of data. In order to overcome these obstacles, we gathered a raw corpus of roughly 6,550 phrases, used the Uniparser-Morph-Bartangi morphological analyzer for linguistically accurate lemmatization, and implemented a thorough cleaning procedure to eliminate noise and ensure proper tokenization. The lemmatized corpus that results greatly lowers word spar- sity and raises the standard of linguistic analysis.The processed corpus was then used to train two different Word2Vec models, Skip-gram and CBOW, with a vector size of 100, a context window of 5, and a minimum frequency threshold of 1. The resultant word embeddings were displayed using dimensionality reduction techniques like PCA and t-SNE, and assessed using intrinsic methods like nearest-neighbor similarity tests. Our tests show that even from tiny datasets, meaningful semantic representations can be obtained by combining informed morphological analysis with clean prepro- cessing. One of the earliest computational datasets for Bartangi, this resource serves as a vital basis for upcoming NLP tasks, such as language modeling, semantic analysis, and low-resource machine translation. To promote more research in Pamiri and other under-represented languages, we make the corpus, lemmatizer pipeline, and trained embeddings publicly available."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tariq-etal-2025-building">
<titleInfo>
<title>Building a Clean Bartangi Language Corpus and Training Word Embeddings for Low-Resource Language Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Warda</namePart>
<namePart type="family">Tariq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Victor</namePart>
<namePart type="family">Popov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vasilii</namePart>
<namePart type="family">Gromov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we showcase a comprehensive end-to-end pipeline for creating a superior Bartangi language corpus and using it for training word embeddings. The critically low-resource Pamiri lan- guage of Bartangi, which is spoken in Tajikistan, has difficulties such as morphological complexity, orthographic variety, and a lack of data. In order to overcome these obstacles, we gathered a raw corpus of roughly 6,550 phrases, used the Uniparser-Morph-Bartangi morphological analyzer for linguistically accurate lemmatization, and implemented a thorough cleaning procedure to eliminate noise and ensure proper tokenization. The lemmatized corpus that results greatly lowers word spar- sity and raises the standard of linguistic analysis.The processed corpus was then used to train two different Word2Vec models, Skip-gram and CBOW, with a vector size of 100, a context window of 5, and a minimum frequency threshold of 1. The resultant word embeddings were displayed using dimensionality reduction techniques like PCA and t-SNE, and assessed using intrinsic methods like nearest-neighbor similarity tests. Our tests show that even from tiny datasets, meaningful semantic representations can be obtained by combining informed morphological analysis with clean prepro- cessing. One of the earliest computational datasets for Bartangi, this resource serves as a vital basis for upcoming NLP tasks, such as language modeling, semantic analysis, and low-resource machine translation. To promote more research in Pamiri and other under-represented languages, we make the corpus, lemmatizer pipeline, and trained embeddings publicly available.</abstract>
<identifier type="citekey">tariq-etal-2025-building</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.145/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>1256</start>
<end>1262</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Building a Clean Bartangi Language Corpus and Training Word Embeddings for Low-Resource Language Modeling
%A Tariq, Warda
%A Popov, Victor
%A Gromov, Vasilii
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F tariq-etal-2025-building
%X In this paper, we showcase a comprehensive end-to-end pipeline for creating a superior Bartangi language corpus and using it for training word embeddings. The critically low-resource Pamiri lan- guage of Bartangi, which is spoken in Tajikistan, has difficulties such as morphological complexity, orthographic variety, and a lack of data. In order to overcome these obstacles, we gathered a raw corpus of roughly 6,550 phrases, used the Uniparser-Morph-Bartangi morphological analyzer for linguistically accurate lemmatization, and implemented a thorough cleaning procedure to eliminate noise and ensure proper tokenization. The lemmatized corpus that results greatly lowers word spar- sity and raises the standard of linguistic analysis.The processed corpus was then used to train two different Word2Vec models, Skip-gram and CBOW, with a vector size of 100, a context window of 5, and a minimum frequency threshold of 1. The resultant word embeddings were displayed using dimensionality reduction techniques like PCA and t-SNE, and assessed using intrinsic methods like nearest-neighbor similarity tests. Our tests show that even from tiny datasets, meaningful semantic representations can be obtained by combining informed morphological analysis with clean prepro- cessing. One of the earliest computational datasets for Bartangi, this resource serves as a vital basis for upcoming NLP tasks, such as language modeling, semantic analysis, and low-resource machine translation. To promote more research in Pamiri and other under-represented languages, we make the corpus, lemmatizer pipeline, and trained embeddings publicly available.
%U https://aclanthology.org/2025.ranlp-1.145/
%P 1256-1262
Markdown (Informal)
[Building a Clean Bartangi Language Corpus and Training Word Embeddings for Low-Resource Language Modeling](https://aclanthology.org/2025.ranlp-1.145/) (Tariq et al., RANLP 2025)
ACL