@inproceedings{mohammed-khalil-2025-athar,
title = "{ATHAR}: A High-Quality and Diverse Dataset for Classical {A}rabic to {E}nglish Translation",
author = "Mohammed, Mohammed Sabry and
Khalil, Mohammed",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.arabicnlp-main.8/",
pages = "97--106",
ISBN = "979-8-89176-352-4",
abstract = "Classical Arabic represents a significant era that encompasses the golden age of Arab culture, philosophy, and scientific literature. With a broad consensus on the importance of translating these literatures to enrich knowledge dissemination across communities, the advent of large language models (LLMs) and translation systems offers promising tools to facilitate this goal. However, we have identified a scarcity of translation datasets in Classical Arabic, which are often limited in scope and topics, hindering the development of high-quality translation systems. In response, we present the ATHAR dataset, which comprises 66,000 high-quality classical Arabic to English translation samples that cover a wide array of topics including science, culture, and philosophy. Furthermore, we assess the performance of current state-of-the-art LLMs under various settings, concluding that there is a need for such datasets in current systems. Our findings highlight how models can benefit from fine-tuning or incorporating this dataset into their pretraining pipelines. The dataset is publicly available on the HuggingFace Data Hub. To preserve anonymity during review, we additionally provide an anonymized snapshot at https://drive.google.com/drive/folders/1c{\_}ElsblaOJzQ0TW{\_}M1DugjR2o3Xv9RUo."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mohammed-khalil-2025-athar">
<titleInfo>
<title>ATHAR: A High-Quality and Diverse Dataset for Classical Arabic to English Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohammed</namePart>
<namePart type="given">Sabry</namePart>
<namePart type="family">Mohammed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammed</namePart>
<namePart type="family">Khalil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Third Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharefah</namePart>
<namePart type="family">Al-Ghamdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakhar</namePart>
<namePart type="family">Alkhereyf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zaid</namePart>
<namePart type="family">Alyafeai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Areeb</namePart>
<namePart type="family">Alowisheq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Go</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waad</namePart>
<namePart type="family">Alshammari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-352-4</identifier>
</relatedItem>
<abstract>Classical Arabic represents a significant era that encompasses the golden age of Arab culture, philosophy, and scientific literature. With a broad consensus on the importance of translating these literatures to enrich knowledge dissemination across communities, the advent of large language models (LLMs) and translation systems offers promising tools to facilitate this goal. However, we have identified a scarcity of translation datasets in Classical Arabic, which are often limited in scope and topics, hindering the development of high-quality translation systems. In response, we present the ATHAR dataset, which comprises 66,000 high-quality classical Arabic to English translation samples that cover a wide array of topics including science, culture, and philosophy. Furthermore, we assess the performance of current state-of-the-art LLMs under various settings, concluding that there is a need for such datasets in current systems. Our findings highlight how models can benefit from fine-tuning or incorporating this dataset into their pretraining pipelines. The dataset is publicly available on the HuggingFace Data Hub. To preserve anonymity during review, we additionally provide an anonymized snapshot at https://drive.google.com/drive/folders/1c_ElsblaOJzQ0TW_M1DugjR2o3Xv9RUo.</abstract>
<identifier type="citekey">mohammed-khalil-2025-athar</identifier>
<location>
<url>https://aclanthology.org/2025.arabicnlp-main.8/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>97</start>
<end>106</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ATHAR: A High-Quality and Diverse Dataset for Classical Arabic to English Translation
%A Mohammed, Mohammed Sabry
%A Khalil, Mohammed
%Y Darwish, Kareem
%Y Ali, Ahmed
%Y Abu Farha, Ibrahim
%Y Touileb, Samia
%Y Zitouni, Imed
%Y Abdelali, Ahmed
%Y Al-Ghamdi, Sharefah
%Y Alkhereyf, Sakhar
%Y Zaghouani, Wajdi
%Y Khalifa, Salam
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Hamed, Injy
%Y Alyafeai, Zaid
%Y Alowisheq, Areeb
%Y Inoue, Go
%Y Mrini, Khalil
%Y Alshammari, Waad
%S Proceedings of The Third Arabic Natural Language Processing Conference
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-352-4
%F mohammed-khalil-2025-athar
%X Classical Arabic represents a significant era that encompasses the golden age of Arab culture, philosophy, and scientific literature. With a broad consensus on the importance of translating these literatures to enrich knowledge dissemination across communities, the advent of large language models (LLMs) and translation systems offers promising tools to facilitate this goal. However, we have identified a scarcity of translation datasets in Classical Arabic, which are often limited in scope and topics, hindering the development of high-quality translation systems. In response, we present the ATHAR dataset, which comprises 66,000 high-quality classical Arabic to English translation samples that cover a wide array of topics including science, culture, and philosophy. Furthermore, we assess the performance of current state-of-the-art LLMs under various settings, concluding that there is a need for such datasets in current systems. Our findings highlight how models can benefit from fine-tuning or incorporating this dataset into their pretraining pipelines. The dataset is publicly available on the HuggingFace Data Hub. To preserve anonymity during review, we additionally provide an anonymized snapshot at https://drive.google.com/drive/folders/1c_ElsblaOJzQ0TW_M1DugjR2o3Xv9RUo.
%U https://aclanthology.org/2025.arabicnlp-main.8/
%P 97-106
Markdown (Informal)
[ATHAR: A High-Quality and Diverse Dataset for Classical Arabic to English Translation](https://aclanthology.org/2025.arabicnlp-main.8/) (Mohammed & Khalil, ArabicNLP 2025)
ACL