% Conference-paper record; the same citation key is used by the MODS and
% EndNote exports of this record that follow in this file.
% Case-sensitive words in the title are brace-protected as whole words so that
% sentence-casing styles keep them intact without breaking kerning.
% NOTE(review): `address` appears to carry the conference location while the
% publisher's city is embedded in `publisher`; kept unchanged to stay
% consistent with the other exports of this record.
@inproceedings{ebrahimkhani-ansari-2025-perspacor,
    title = "{PerSpaCor}: Correcting Space and {ZWNJ} Errors in {Persian} Text with Transformer Models",
    author = "Ebrahimkhani, Matin  and
      Ansari, Ebrahim",
    editor = "Angelova, Galia  and
      Kunilovskaya, Maria  and
      Escribe, Marie  and
      Mitkov, Ruslan",
    booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative {AI} Era",
    month = sep,
    year = "2025",
    address = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
    url = "https://aclanthology.org/2025.ranlp-1.40/",
    pages = "325--333",
    abstract = "Precision and clarity are essential qualities of written texts; however, Persian script, rooted in Arabic script, presents unique challenges that can compromise readability and correctness. In particular, the use of space and half-space{---}specifically the Zero Width Non-Joiner (ZWNJ){---}is essential for proper character separation in Persian typography. This research introduces four models for correcting spacing and ZWNJ errors at the character level, thereby improving both readability and textual accuracy. By fine-tuning BERT-based transformer models on Bijankhan and Peykare corpora{---}comprising over 12.7 million preprocessed and annotated words{---}and formulating the task as sequence labeling, the best model achieves a macro-average F1-score of 97.26{\%}. An interactive corrector that incorporates user input further improves performance to a macro-average F1-score of 98.38{\%}. These results demonstrate the effectiveness of advanced language models in enhancing Persian text quality and highlight their applicability to real-world natural language processing tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- Bibliographic record for the paper; the ID attribute matches the citekey identifier near the end of the record. -->
<mods ID="ebrahimkhani-ansari-2025-perspacor">
<titleInfo>
<title>PerSpaCor: Correcting Space and ZWNJ Errors in Persian Text with Transformer Models</title>
</titleInfo>
<!-- Personal names with role "author". -->
<name type="personal">
<namePart type="given">Matin</namePart>
<namePart type="family">Ebrahimkhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ebrahim</namePart>
<namePart type="family">Ansari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume (genre "conference publication") with its editors, publisher and place. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Precision and clarity are essential qualities of written texts; however, Persian script, rooted in Arabic script, presents unique challenges that can compromise readability and correctness. In particular, the use of space and half-space—specifically the Zero Width Non-Joiner (ZWNJ)—is essential for proper character separation in Persian typography. This research introduces four models for correcting spacing and ZWNJ errors at the character level, thereby improving both readability and textual accuracy. By fine-tuning BERT-based transformer models on Bijankhan and Peykare corpora—comprising over 12.7 million preprocessed and annotated words—and formulating the task as sequence labeling, the best model achieves a macro-average F1-score of 97.26%. An interactive corrector that incorporates user input further improves performance to a macro-average F1-score of 98.38%. These results demonstrate the effectiveness of advanced language models in enhancing Persian text quality and highlight their applicability to real-world natural language processing tasks.</abstract>
<identifier type="citekey">ebrahimkhani-ansari-2025-perspacor</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.40/</url>
</location>
<!-- Date and page extent (start and end page) of the paper. -->
<part>
<date>2025-09</date>
<extent unit="page">
<start>325</start>
<end>333</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PerSpaCor: Correcting Space and ZWNJ Errors in Persian Text with Transformer Models
%A Ebrahimkhani, Matin
%A Ansari, Ebrahim
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F ebrahimkhani-ansari-2025-perspacor
%X Precision and clarity are essential qualities of written texts; however, Persian script, rooted in Arabic script, presents unique challenges that can compromise readability and correctness. In particular, the use of space and half-space—specifically the Zero Width Non-Joiner (ZWNJ)—is essential for proper character separation in Persian typography. This research introduces four models for correcting spacing and ZWNJ errors at the character level, thereby improving both readability and textual accuracy. By fine-tuning BERT-based transformer models on Bijankhan and Peykare corpora—comprising over 12.7 million preprocessed and annotated words—and formulating the task as sequence labeling, the best model achieves a macro-average F1-score of 97.26%. An interactive corrector that incorporates user input further improves performance to a macro-average F1-score of 98.38%. These results demonstrate the effectiveness of advanced language models in enhancing Persian text quality and highlight their applicability to real-world natural language processing tasks.
%U https://aclanthology.org/2025.ranlp-1.40/
%P 325-333
Markdown (Informal)
[PerSpaCor: Correcting Space and ZWNJ Errors in Persian Text with Transformer Models](https://aclanthology.org/2025.ranlp-1.40/) (Ebrahimkhani & Ansari, RANLP 2025)
ACL