@inproceedings{alasmary-etal-2024-catt,
title = "{CATT}: Character-based {A}rabic Tashkeel Transformer",
author = "Alasmary, Faris and
Zaafarani, Orjuwan and
Ghannam, Ahmad",
editor = "Habash, Nizar and
Bouamor, Houda and
Eskander, Ramy and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Abdelali, Ahmed and
Touileb, Samia and
Hamed, Injy and
Onaizan, Yaser and
Alhafni, Bashar and
Antoun, Wissam and
Khalifa, Salam and
Haddad, Hatem and
Zitouni, Imed and
AlKhamissi, Badr and
Almatham, Rawan and
Mrini, Khalil",
booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.arabicnlp-1.21",
doi = "10.18653/v1/2024.arabicnlp-1.21",
pages = "250--257",
abstract = "Tashkeel, or Arabic Text Diacritization (ATD), greatly enhances the comprehension of Arabic text by removing ambiguity and minimizing the risk of misinterpretations caused by its absence.It plays a crucial role in improving Arabic text processing, particularly in applications such as text-to-speech and machine translation.This paper introduces a new approach to training ATD models.First, we finetuned two transformers, encoder-only and encoder-decoder, that were initialized from a pretrained character-based BERT.Then, we applied the Noisy-Student approach to boost the performance of the best model.We evaluated our models alongside 11 commercial and open-source models using two manually labeled benchmark datasets: WikiNews and our CATT dataset.Our findings show that our top model surpasses all evaluated models by relative Diacritic Error Rates (DERs) of 30.83{\%} and 35.21{\%} on WikiNews and CATT, respectively, achieving state-of-the-art in ATD.In addition, we show that our model outperforms GPT-4-turbo on CATT dataset by a relative DER of 9.36{\%}.We open-source our CATT models and benchmark dataset for the research community .",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alasmary-etal-2024-catt">
<titleInfo>
<title>CATT: Character-based Arabic Tashkeel Transformer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Faris</namePart>
<namePart type="family">Alasmary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Orjuwan</namePart>
<namePart type="family">Zaafarani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmad</namePart>
<namePart type="family">Ghannam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Second Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramy</namePart>
<namePart type="family">Eskander</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadi</namePart>
<namePart type="family">Tomeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bashar</namePart>
<namePart type="family">Alhafni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wissam</namePart>
<namePart type="family">Antoun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hatem</namePart>
<namePart type="family">Haddad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Tashkeel, or Arabic Text Diacritization (ATD), greatly enhances the comprehension of Arabic text by removing ambiguity and minimizing the risk of misinterpretations caused by its absence. It plays a crucial role in improving Arabic text processing, particularly in applications such as text-to-speech and machine translation. This paper introduces a new approach to training ATD models. First, we finetuned two transformers, encoder-only and encoder-decoder, that were initialized from a pretrained character-based BERT. Then, we applied the Noisy-Student approach to boost the performance of the best model. We evaluated our models alongside 11 commercial and open-source models using two manually labeled benchmark datasets: WikiNews and our CATT dataset. Our findings show that our top model surpasses all evaluated models by relative Diacritic Error Rates (DERs) of 30.83% and 35.21% on WikiNews and CATT, respectively, achieving state-of-the-art in ATD. In addition, we show that our model outperforms GPT-4-turbo on the CATT dataset by a relative DER of 9.36%. We open-source our CATT models and benchmark dataset for the research community.</abstract>
<identifier type="citekey">alasmary-etal-2024-catt</identifier>
<identifier type="doi">10.18653/v1/2024.arabicnlp-1.21</identifier>
<location>
<url>https://aclanthology.org/2024.arabicnlp-1.21</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>250</start>
<end>257</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CATT: Character-based Arabic Tashkeel Transformer
%A Alasmary, Faris
%A Zaafarani, Orjuwan
%A Ghannam, Ahmad
%Y Habash, Nizar
%Y Bouamor, Houda
%Y Eskander, Ramy
%Y Tomeh, Nadi
%Y Abu Farha, Ibrahim
%Y Abdelali, Ahmed
%Y Touileb, Samia
%Y Hamed, Injy
%Y Onaizan, Yaser
%Y Alhafni, Bashar
%Y Antoun, Wissam
%Y Khalifa, Salam
%Y Haddad, Hatem
%Y Zitouni, Imed
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Mrini, Khalil
%S Proceedings of The Second Arabic Natural Language Processing Conference
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F alasmary-etal-2024-catt
%X Tashkeel, or Arabic Text Diacritization (ATD), greatly enhances the comprehension of Arabic text by removing ambiguity and minimizing the risk of misinterpretations caused by its absence. It plays a crucial role in improving Arabic text processing, particularly in applications such as text-to-speech and machine translation. This paper introduces a new approach to training ATD models. First, we finetuned two transformers, encoder-only and encoder-decoder, that were initialized from a pretrained character-based BERT. Then, we applied the Noisy-Student approach to boost the performance of the best model. We evaluated our models alongside 11 commercial and open-source models using two manually labeled benchmark datasets: WikiNews and our CATT dataset. Our findings show that our top model surpasses all evaluated models by relative Diacritic Error Rates (DERs) of 30.83% and 35.21% on WikiNews and CATT, respectively, achieving state-of-the-art in ATD. In addition, we show that our model outperforms GPT-4-turbo on the CATT dataset by a relative DER of 9.36%. We open-source our CATT models and benchmark dataset for the research community.
%R 10.18653/v1/2024.arabicnlp-1.21
%U https://aclanthology.org/2024.arabicnlp-1.21
%U https://doi.org/10.18653/v1/2024.arabicnlp-1.21
%P 250-257
Markdown (Informal)
[CATT: Character-based Arabic Tashkeel Transformer](https://aclanthology.org/2024.arabicnlp-1.21) (Alasmary et al., ArabicNLP-WS 2024)
ACL
Faris Alasmary, Orjuwan Zaafarani, and Ahmad Ghannam. 2024. CATT: Character-based Arabic Tashkeel Transformer. In Proceedings of The Second Arabic Natural Language Processing Conference, pages 250–257, Bangkok, Thailand. Association for Computational Linguistics.
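
For readers unfamiliar with the metric reported in the abstract, the sketch below shows how Diacritic Error Rate (DER) and a relative DER reduction (such as the quoted 30.83%) are conventionally computed. This is an illustrative assumption on our part, not code released with the paper; the function names and the equal-length alignment assumption are hypothetical.

```python
# Illustrative only: DER as conventionally defined, i.e. the fraction of
# character positions whose predicted diacritic differs from the reference.
# The equal-length, pre-aligned sequence assumption is ours, not the paper's.

def der(reference: list[str], prediction: list[str]) -> float:
    """Fraction of positions whose diacritic label is wrong."""
    assert len(reference) == len(prediction), "sequences must be aligned"
    errors = sum(r != p for r, p in zip(reference, prediction))
    return errors / len(reference) if reference else 0.0

def relative_der_reduction(baseline_der: float, model_der: float) -> float:
    """Relative improvement of one DER over another, as a fraction."""
    return (baseline_der - model_der) / baseline_der

# Example with made-up numbers: a baseline DER of 3.0% reduced to 2.0%
# corresponds to a relative DER reduction of about 33.3%.
print(relative_der_reduction(0.030, 0.020))  # ~0.333
```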