@inproceedings{kalahroodi-etal-2026-persianpunc,
title = "{P}ersian{P}unc: A Large-Scale Dataset and {BERT}-Based Approach for {P}ersian Punctuation Restoration",
author = "Kalahroodi, Mohammad Javad Ranjbar and
Faili, Heshaam and
Shakery, Azadeh",
editor = "Merchant, Rayyan and
Megerdoomian, Karine",
booktitle = "The Proceedings of the First Workshop on {NLP} and {LLM}s for the {I}ranian Language Family",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.silkroadnlp-1.11/",
pages = "105--113",
ISBN = "979-8-89176-371-5",
abstract = "Punctuation restoration is essential for improving the readability and downstream utility of automatic speech recognition (ASR) outputs, yet remains underexplored for Persian despite its importance. We introduce PersianPunc, a large-scale, high-quality dataset of 17 million samples for Persian punctuation restoration, constructed through systematic aggregation and filtering of existing textual resources. We formulate punctuation restoration as a token-level sequence labeling task and fine-tune ParsBERT to achieve strong performance. Through comparative evaluation, we demonstrate that while large language models can perform punctuation restoration, they suffer from critical limitations: over-correction tendencies that introduce undesired edits beyond punctuation insertion (particularly problematic for speech-to-text pipelines) and substantially higher computational requirements. Our lightweight BERT-based approach achieves a macro-averaged F1 score of 91.33{\%} on our test set while maintaining efficiency suitable for real-time applications. We make our dataset and model publicly available to facilitate future research in Persian NLP and provide a scalable framework applicable to other morphologically rich, low-resource languages."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kalahroodi-etal-2026-persianpunc">
<titleInfo>
<title>PersianPunc: A Large-Scale Dataset and BERT-Based Approach for Persian Punctuation Restoration</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Javad</namePart>
<namePart type="given">Ranjbar</namePart>
<namePart type="family">Kalahroodi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heshaam</namePart>
<namePart type="family">Faili</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Azadeh</namePart>
<namePart type="family">Shakery</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The Proceedings of the First Workshop on NLP and LLMs for the Iranian Language Family</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rayyan</namePart>
<namePart type="family">Merchant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karine</namePart>
<namePart type="family">Megerdoomian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-371-5</identifier>
</relatedItem>
<abstract>Punctuation restoration is essential for improving the readability and downstream utility of automatic speech recognition (ASR) outputs, yet remains underexplored for Persian despite its importance. We introduce PersianPunc, a large-scale, high-quality dataset of 17 million samples for Persian punctuation restoration, constructed through systematic aggregation and filtering of existing textual resources. We formulate punctuation restoration as a token-level sequence labeling task and fine-tune ParsBERT to achieve strong performance. Through comparative evaluation, we demonstrate that while large language models can perform punctuation restoration, they suffer from critical limitations: over-correction tendencies that introduce undesired edits beyond punctuation insertion (particularly problematic for speech-to-text pipelines) and substantially higher computational requirements. Our lightweight BERT-based approach achieves a macro-averaged F1 score of 91.33% on our test set while maintaining efficiency suitable for real-time applications. We make our dataset and model publicly available to facilitate future research in Persian NLP and provide a scalable framework applicable to other morphologically rich, low-resource languages.</abstract>
<identifier type="citekey">kalahroodi-etal-2026-persianpunc</identifier>
<location>
<url>https://aclanthology.org/2026.silkroadnlp-1.11/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>105</start>
<end>113</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PersianPunc: A Large-Scale Dataset and BERT-Based Approach for Persian Punctuation Restoration
%A Kalahroodi, Mohammad Javad Ranjbar
%A Faili, Heshaam
%A Shakery, Azadeh
%Y Merchant, Rayyan
%Y Megerdoomian, Karine
%S The Proceedings of the First Workshop on NLP and LLMs for the Iranian Language Family
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-371-5
%F kalahroodi-etal-2026-persianpunc
%X Punctuation restoration is essential for improving the readability and downstream utility of automatic speech recognition (ASR) outputs, yet remains underexplored for Persian despite its importance. We introduce PersianPunc, a large-scale, high-quality dataset of 17 million samples for Persian punctuation restoration, constructed through systematic aggregation and filtering of existing textual resources. We formulate punctuation restoration as a token-level sequence labeling task and fine-tune ParsBERT to achieve strong performance. Through comparative evaluation, we demonstrate that while large language models can perform punctuation restoration, they suffer from critical limitations: over-correction tendencies that introduce undesired edits beyond punctuation insertion (particularly problematic for speech-to-text pipelines) and substantially higher computational requirements. Our lightweight BERT-based approach achieves a macro-averaged F1 score of 91.33% on our test set while maintaining efficiency suitable for real-time applications. We make our dataset and model publicly available to facilitate future research in Persian NLP and provide a scalable framework applicable to other morphologically rich, low-resource languages.
%U https://aclanthology.org/2026.silkroadnlp-1.11/
%P 105-113
Markdown (Informal)
[PersianPunc: A Large-Scale Dataset and BERT-Based Approach for Persian Punctuation Restoration](https://aclanthology.org/2026.silkroadnlp-1.11/) (Kalahroodi et al., SilkRoadNLP 2026)
ACL