% ACL Anthology record 2026.abjadnlp-1.43 (workshop paper, pages 371--379).
% Case-sensitive words in title/booktitle are brace-protected as whole words
% so sentence-casing styles keep "Arabic", "ASR" and "NLP" intact.
% NOTE(review): `address` appears to hold the conference venue rather than
% the publisher's city -- confirm against the target style before changing.
@inproceedings{abduljalil-etal-2026-arabic,
    title     = {{Arabic}-Adapted One-Step Speech-to-Diacritized {ASR}: Evaluation and Error Analysis},
    author    = {Abduljalil, Osamah A. I. and
                 Ali, Dalal and
                 Bajaman, Razan A. and
                 Alharbi, Abdullah I.},
    booktitle = {Proceedings of the 2nd Workshop on {NLP} for Languages Using {Arabic} Script},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.abjadnlp-1.43/},
    pages     = {371--379},
    abstract  = {Arabic diacritics encode phonetic information essential for pronunciation, disambiguation, and downstream applications, yet most Arabic ASR systems generate undiacritized output. In this work, we study direct speech-to-diacritized-text recognition using a single-stage ASR pipeline that predicts diacritics jointly with Arabic letters, without text-based post-processing. We evaluate two Arabic-adapted ASR architectures{---}wav2vec 2.0 XLSR-53 and Whisper-base{---}under a unified experimental setup on the ClArTTS Classical Arabic dataset. Performance is assessed using surface and lexical WER/CER alongside diacritic error rate (DER) to disentangle base transcription accuracy from diacritic realization. Our results show that Arabic-adapted wav2vec 2.0 achieves substantially lower diacritic error rates than Whisper, indicating stronger exploitation of acoustic cues relevant to vowelization. We further analyze the effect of decoding strategy and provide a detailed breakdown of diacritic errors, highlighting challenges associated with short vowels and morphosyntactic markers. These findings underscore the importance of model architecture and Arabic-specific adaptation for accurate diacritized Arabic ASR.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey abduljalil-etal-2026-arabic: four personal-name
       authors, the host proceedings as a relatedItem, and page extent 371-379. -->
  <mods ID="abduljalil-etal-2026-arabic">
    <titleInfo>
      <title>Arabic-Adapted One-Step Speech-to-Diacritized ASR: Evaluation and Error Analysis</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Osamah</namePart>
      <namePart type="given">A</namePart>
      <namePart type="given">I</namePart>
      <namePart type="family">Abduljalil</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dalal</namePart>
      <namePart type="family">Ali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Razan</namePart>
      <namePart type="given">A</namePart>
      <namePart type="family">Bajaman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Abdullah</namePart>
      <namePart type="given">I</namePart>
      <namePart type="family">Alharbi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Arabic diacritics encode phonetic information essential for pronunciation, disambiguation, and downstream applications, yet most Arabic ASR systems generate undiacritized output. In this work, we study direct speech-to-diacritized-text recognition using a single-stage ASR pipeline that predicts diacritics jointly with Arabic letters, without text-based post-processing. We evaluate two Arabic-adapted ASR architectures—wav2vec 2.0 XLSR-53 and Whisper-base—under a unified experimental setup on the ClArTTS Classical Arabic dataset. Performance is assessed using surface and lexical WER/CER alongside diacritic error rate (DER) to disentangle base transcription accuracy from diacritic realization. Our results show that Arabic-adapted wav2vec 2.0 achieves substantially lower diacritic error rates than Whisper, indicating stronger exploitation of acoustic cues relevant to vowelization. We further analyze the effect of decoding strategy and provide a detailed breakdown of diacritic errors, highlighting challenges associated with short vowels and morphosyntactic markers. These findings underscore the importance of model architecture and Arabic-specific adaptation for accurate diacritized Arabic ASR.</abstract>
    <identifier type="citekey">abduljalil-etal-2026-arabic</identifier>
    <location>
      <url>https://aclanthology.org/2026.abjadnlp-1.43/</url>
    </location>
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>371</start>
        <end>379</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Arabic-Adapted One-Step Speech-to-Diacritized ASR: Evaluation and Error Analysis
%A Abduljalil, Osamah A. I.
%A Ali, Dalal
%A Bajaman, Razan A.
%A Alharbi, Abdullah I.
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F abduljalil-etal-2026-arabic
%X Arabic diacritics encode phonetic information essential for pronunciation, disambiguation, and downstream applications, yet most Arabic ASR systems generate undiacritized output. In this work, we study direct speech-to-diacritized-text recognition using a single-stage ASR pipeline that predicts diacritics jointly with Arabic letters, without text-based post-processing. We evaluate two Arabic-adapted ASR architectures—wav2vec 2.0 XLSR-53 and Whisper-base—under a unified experimental setup on the ClArTTS Classical Arabic dataset. Performance is assessed using surface and lexical WER/CER alongside diacritic error rate (DER) to disentangle base transcription accuracy from diacritic realization. Our results show that Arabic-adapted wav2vec 2.0 achieves substantially lower diacritic error rates than Whisper, indicating stronger exploitation of acoustic cues relevant to vowelization. We further analyze the effect of decoding strategy and provide a detailed breakdown of diacritic errors, highlighting challenges associated with short vowels and morphosyntactic markers. These findings underscore the importance of model architecture and Arabic-specific adaptation for accurate diacritized Arabic ASR.
%U https://aclanthology.org/2026.abjadnlp-1.43/
%P 371-379
Markdown (Informal)
[Arabic-Adapted One-Step Speech-to-Diacritized ASR: Evaluation and Error Analysis](https://aclanthology.org/2026.abjadnlp-1.43/) (Abduljalil et al., AbjadNLP 2026)
ACL