% Conference-paper record for the HarfoSokhan dataset paper.
% Words that must keep their capitalisation under sentence-casing styles are
% brace-protected as whole words rather than letter by letter.
@inproceedings{sarvestani-etal-2026-harfosokhan,
  title     = {{HarfoSokhan}: A Comprehensive Parallel Dataset for Transitions between {Persian} Colloquial and Formal Variations},
  author    = {Sarvestani, Hamid Jahad and
               Ramezanian, Vida and
               Saadat, Saee and
               Serajeh, Neda Taghizadeh and
               Taheri, Maryam Sadat Razavi and
               Kasaei, Shohreh and
               Fazli, MohammadAmin and
               Asgari, Ehsaneddin},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.eacl-long.346/},
  pages     = {7380--7392},
  isbn      = {979-8-89176-380-7},
  abstract  = {A wide array of NLP/NLU models have been developed for the Persian language and have shown promising results. However, the performance of such models drops significantly when applied to the colloquial form of Persian. This challenge arises from the substantial differences between colloquial and formal Persian and the lack of parallel data facilitating the robustness of the model to the colloquial data or to transform the data to formal Persian. In addressing this gap, our research is dedicated to the development of the HarfoSokhan dataset, a large-scale colloquial to formal Persian parallel dataset of 6M sentence pairs. Our proposed dataset is a critical resource for training models that can effectively bridge the linguistic variations between colloquial and formal Persian. To illustrate the utility of our dataset, we used it to train a GPT2 model, which exhibited remarkable proficiency in colloquial to formal text style transfer, outperforming both OpenAI{'}s GPT-3.5-turbo model and a leading rule-based system in this task. This conclusion is supported by our proposed ranking-based human evaluation. The results underscore the significance of the HarfoSokhan dataset in enhancing the performance of natural language processing models in the challenging task of colloquial to formal Persian conversion.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="sarvestani-etal-2026-harfosokhan">
    <titleInfo>
      <title>HarfoSokhan: A Comprehensive Parallel Dataset for Transitions between Persian Colloquial and Formal Variations</title>
    </titleInfo>

    <!-- Authors of the paper, one <name> element each. -->
    <name type="personal">
      <namePart type="given">Hamid</namePart>
      <namePart type="given">Jahad</namePart>
      <namePart type="family">Sarvestani</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vida</namePart>
      <namePart type="family">Ramezanian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saee</namePart>
      <namePart type="family">Saadat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Neda</namePart>
      <namePart type="given">Taghizadeh</namePart>
      <namePart type="family">Serajeh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maryam</namePart>
      <namePart type="given">Sadat</namePart>
      <namePart type="given">Razavi</namePart>
      <namePart type="family">Taheri</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shohreh</namePart>
      <namePart type="family">Kasaei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">MohammadAmin</namePart>
      <namePart type="family">Fazli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ehsaneddin</namePart>
      <namePart type="family">Asgari</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Vera</namePart>
        <namePart type="family">Demberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kentaro</namePart>
        <namePart type="family">Inui</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lluís</namePart>
        <namePart type="family">Marquez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-380-7</identifier>
    </relatedItem>

    <abstract>A wide array of NLP/NLU models have been developed for the Persian language and have shown promising results. However, the performance of such models drops significantly when applied to the colloquial form of Persian. This challenge arises from the substantial differences between colloquial and formal Persian and the lack of parallel data facilitating the robustness of the model to the colloquial data or to transform the data to formal Persian. In addressing this gap, our research is dedicated to the development of the HarfoSokhan dataset, a large-scale colloquial to formal Persian parallel dataset of 6M sentence pairs. Our proposed dataset is a critical resource for training models that can effectively bridge the linguistic variations between colloquial and formal Persian. To illustrate the utility of our dataset, we used it to train a GPT2 model, which exhibited remarkable proficiency in colloquial to formal text style transfer, outperforming both OpenAI’s GPT-3.5-turbo model and a leading rule-based system in this task. This conclusion is supported by our proposed ranking-based human evaluation. The results underscore the significance of the HarfoSokhan dataset in enhancing the performance of natural language processing models in the challenging task of colloquial to formal Persian conversion.</abstract>
    <identifier type="citekey">sarvestani-etal-2026-harfosokhan</identifier>
    <location>
      <url>https://aclanthology.org/2026.eacl-long.346/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>7380</start>
        <end>7392</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T HarfoSokhan: A Comprehensive Parallel Dataset for Transitions between Persian Colloquial and Formal Variations
%A Sarvestani, Hamid Jahad
%A Ramezanian, Vida
%A Saadat, Saee
%A Serajeh, Neda Taghizadeh
%A Taheri, Maryam Sadat Razavi
%A Kasaei, Shohreh
%A Fazli, MohammadAmin
%A Asgari, Ehsaneddin
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F sarvestani-etal-2026-harfosokhan
%X A wide array of NLP/NLU models have been developed for the Persian language and have shown promising results. However, the performance of such models drops significantly when applied to the colloquial form of Persian. This challenge arises from the substantial differences between colloquial and formal Persian and the lack of parallel data facilitating the robustness of the model to the colloquial data or to transform the data to formal Persian. In addressing this gap, our research is dedicated to the development of the HarfoSokhan dataset, a large-scale colloquial to formal Persian parallel dataset of 6M sentence pairs. Our proposed dataset is a critical resource for training models that can effectively bridge the linguistic variations between colloquial and formal Persian. To illustrate the utility of our dataset, we used it to train a GPT2 model, which exhibited remarkable proficiency in colloquial to formal text style transfer, outperforming both OpenAI’s GPT-3.5-turbo model and a leading rule-based system in this task. This conclusion is supported by our proposed ranking-based human evaluation. The results underscore the significance of the HarfoSokhan dataset in enhancing the performance of natural language processing models in the challenging task of colloquial to formal Persian conversion.
%U https://aclanthology.org/2026.eacl-long.346/
%P 7380-7392
Markdown (Informal)
[HarfoSokhan: A Comprehensive Parallel Dataset for Transitions between Persian Colloquial and Formal Variations](https://aclanthology.org/2026.eacl-long.346/) (Sarvestani et al., EACL 2026)
ACL
- Hamid Jahad Sarvestani, Vida Ramezanian, Saee Saadat, Neda Taghizadeh Serajeh, Maryam Sadat Razavi Taheri, Shohreh Kasaei, MohammadAmin Fazli, and Ehsaneddin Asgari. 2026. HarfoSokhan: A Comprehensive Parallel Dataset for Transitions between Persian Colloquial and Formal Variations. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 7380–7392, Rabat, Morocco. Association for Computational Linguistics.