@inproceedings{kurbonovich-2026-character,
title = "Character-Level Transformer for {T}ajik{--}{P}ersian Transliteration with a Parallel Lexical Corpus",
author = "Kurbonovich, Arabov Mullosharaf",
booktitle = "Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.abjadnlp-1.10/",
pages = "75--83",
abstract = "This study addresses automatic transliteration from Tajik (Cyrillic script) to Persian (Perso-Arabic script). We present a curated, lexicographically verified parallel corpus of 52,152 Tajik{--}Persian words and short phrases, compiled from printed dictionaries, encyclopedic sources, and manually verified online resources. To the best of our knowledge, this is one of the largest publicly available word-level corpora for Tajik{--}Persian transliteration. Using this corpus, we train a character-level sequence-to-sequence Transformer model and evaluate it using Character Error Rate (CER) and exact-match accuracy. The best Transformer configuration with beam search (k=3) achieves a CER of 0.3182 and an exact-match accuracy of 0.3215, achieving lower error rates than dictionary-based rule-based and recurrent neural baselines. We describe the data collection and preprocessing pipeline, model architecture, and experimental protocol, and report a part-of-speech analysis showing performance differences across lexical categories. All resources (dataset, preprocessing scripts, splits, and training configurations) will be released publicly to ensure reproducibility and facilitate future work on Tajik{--}Persian transliteration, cross-script NLP, and lexicographic applications."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kurbonovich-2026-character">
<titleInfo>
<title>Character-Level Transformer for Tajik–Persian Transliteration with a Parallel Lexical Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arabov</namePart>
<namePart type="given">Mullosharaf</namePart>
<namePart type="family">Kurbonovich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study addresses automatic transliteration from Tajik (Cyrillic script) to Persian (Perso-Arabic script). We present a curated, lexicographically verified parallel corpus of 52,152 Tajik–Persian words and short phrases, compiled from printed dictionaries, encyclopedic sources, and manually verified online resources. To the best of our knowledge, this is one of the largest publicly available word-level corpora for Tajik–Persian transliteration. Using this corpus, we train a character-level sequence-to-sequence Transformer model and evaluate it using Character Error Rate (CER) and exact-match accuracy. The best Transformer configuration with beam search (k=3) achieves a CER of 0.3182 and an exact-match accuracy of 0.3215, achieving lower error rates than dictionary-based rule-based and recurrent neural baselines. We describe the data collection and preprocessing pipeline, model architecture, and experimental protocol, and report a part-of-speech analysis showing performance differences across lexical categories. All resources (dataset, preprocessing scripts, splits, and training configurations) will be released publicly to ensure reproducibility and facilitate future work on Tajik–Persian transliteration, cross-script NLP, and lexicographic applications.</abstract>
<identifier type="citekey">kurbonovich-2026-character</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.10/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>75</start>
<end>83</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Character-Level Transformer for Tajik–Persian Transliteration with a Parallel Lexical Corpus
%A Kurbonovich, Arabov Mullosharaf
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F kurbonovich-2026-character
%X This study addresses automatic transliteration from Tajik (Cyrillic script) to Persian (Perso-Arabic script). We present a curated, lexicographically verified parallel corpus of 52,152 Tajik–Persian words and short phrases, compiled from printed dictionaries, encyclopedic sources, and manually verified online resources. To the best of our knowledge, this is one of the largest publicly available word-level corpora for Tajik–Persian transliteration. Using this corpus, we train a character-level sequence-to-sequence Transformer model and evaluate it using Character Error Rate (CER) and exact-match accuracy. The best Transformer configuration with beam search (k=3) achieves a CER of 0.3182 and an exact-match accuracy of 0.3215, achieving lower error rates than dictionary-based rule-based and recurrent neural baselines. We describe the data collection and preprocessing pipeline, model architecture, and experimental protocol, and report a part-of-speech analysis showing performance differences across lexical categories. All resources (dataset, preprocessing scripts, splits, and training configurations) will be released publicly to ensure reproducibility and facilitate future work on Tajik–Persian transliteration, cross-script NLP, and lexicographic applications.
%U https://aclanthology.org/2026.abjadnlp-1.10/
%P 75-83
Markdown (Informal)
[Character-Level Transformer for Tajik–Persian Transliteration with a Parallel Lexical Corpus](https://aclanthology.org/2026.abjadnlp-1.10/) (Kurbonovich, AbjadNLP 2026)
ACL