% ACL Anthology BibTeX record for the OMAN-SPEECH paper (AbjadNLP 2026).
% Case-sensitive words are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitalisation.
@inproceedings{al-khadhuri-etal-2026-oman,
  title     = {{OMAN-SPEECH}: A Multi-Layer Annotated Speech Corpus for {Omani} {Arabic} Dialects},
  author    = {Al Khadhuri, Rayyan S. and
               Al Mahrouqi, Firas and
               Al Mandhari, Salim and
               Al-Kathiri, Amir Azad and
               Alshahri, Omar Said and
               Alsaqr, Ghassab Mansoor and
               Mudhsh, Badri Abdulhakim and
               Fatnassi, Tarek},
  booktitle = {Proceedings of the 2nd Workshop on {NLP} for Languages Using {Arabic} Script},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.abjadnlp-1.31/},
  pages     = {229--235},
  abstract  = {Automatic Speech Recognition (ASR) has achieved strong performance in high-resource languages; however, Dialectal Arabic remains significantly under-resourced. This gap is particularly evident in Oman, where Arabic exhibits substantial sociolinguistic variation shaped by settlement patterns between sedentary (Hadari) and nomadic (Badu) communities, which are often overlooked by urban-centric or generalized Gulf Arabic datasets. We introduce OMAN-SPEECH, a sociolinguistically stratified spoken corpus for Omani Arabic comprising approximately 40 hours of spontaneous and semi-spontaneous speech from 32 speakers across 11 Wilayats (provinces). The corpus is balanced to capture regional and lifestyle variation and is annotated at the sentence level with Arabic transcription, English translation, and phonetic transcription using the International Phonetic Alphabet (IPA) through a human-in-the-loop annotation pipeline. OMAN-SPEECH provides a foundational resource for evaluating ASR and related speech technologies on Omani and Gulf Arabic varieties and supports more granular modeling of regional dialectal variation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- MODS v3 collection holding a single record for the paper whose
     citekey is al-khadhuri-etal-2026-oman. The record lists, in order:
     the title, eight personal-name authors, the issue date, the host
     proceedings, the abstract, the citekey, the URL and the page extent. -->
<mods ID="al-khadhuri-etal-2026-oman">
<titleInfo>
<title>OMAN-SPEECH: A Multi-Layer Annotated Speech Corpus for Omani Arabic Dialects</title>
</titleInfo>
<!-- Authors: one <name> element per person, each with the "author" role.
     Multiple given names are stored as separate "given" name parts. -->
<name type="personal">
<namePart type="given">Rayyan</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Al Khadhuri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firas</namePart>
<namePart type="family">Al Mahrouqi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salim</namePart>
<namePart type="family">Al Mandhari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="given">Azad</namePart>
<namePart type="family">Al-Kathiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omar</namePart>
<namePart type="given">Said</namePart>
<namePart type="family">Alshahri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ghassab</namePart>
<namePart type="given">Mansoor</namePart>
<namePart type="family">Alsaqr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badri</namePart>
<namePart type="given">Abdulhakim</namePart>
<namePart type="family">Mudhsh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tarek</namePart>
<namePart type="family">Fatnassi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume, with its publisher and place. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic Speech Recognition (ASR) has achieved strong performance in high-resource languages; however, Dialectal Arabic remains significantly under-resourced. This gap is particularly evident in Oman, where Arabic exhibits substantial sociolinguistic variation shaped by settlement patterns between sedentary (Hadari) and nomadic (Badu) communities, which are often overlooked by urban-centric or generalized Gulf Arabic datasets. We introduce OMAN-SPEECH, a sociolinguistically stratified spoken corpus for Omani Arabic comprising approximately 40 hours of spontaneous and semi-spontaneous speech from 32 speakers across 11 Wilayats (provinces). The corpus is balanced to capture regional and lifestyle variation and is annotated at the sentence level with Arabic transcription, English translation, and phonetic transcription using the International Phonetic Alphabet (IPA) through a human-in-the-loop annotation pipeline. OMAN-SPEECH provides a foundational resource for evaluating ASR and related speech technologies on Omani and Gulf Arabic varieties and supports more granular modeling of regional dialectal variation.</abstract>
<identifier type="citekey">al-khadhuri-etal-2026-oman</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.31/</url>
</location>
<!-- Position within the host volume: issue date and first/last page. -->
<part>
<date>2026-03</date>
<extent unit="page">
<start>229</start>
<end>235</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OMAN-SPEECH: A Multi-Layer Annotated Speech Corpus for Omani Arabic Dialects
%A Al Khadhuri, Rayyan S.
%A Al Mahrouqi, Firas
%A Al Mandhari, Salim
%A Al-Kathiri, Amir Azad
%A Alshahri, Omar Said
%A Alsaqr, Ghassab Mansoor
%A Mudhsh, Badri Abdulhakim
%A Fatnassi, Tarek
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F al-khadhuri-etal-2026-oman
%X Automatic Speech Recognition (ASR) has achieved strong performance in high-resource languages; however, Dialectal Arabic remains significantly under-resourced. This gap is particularly evident in Oman, where Arabic exhibits substantial sociolinguistic variation shaped by settlement patterns between sedentary (Hadari) and nomadic (Badu) communities, which are often overlooked by urban-centric or generalized Gulf Arabic datasets. We introduce OMAN-SPEECH, a sociolinguistically stratified spoken corpus for Omani Arabic comprising approximately 40 hours of spontaneous and semi-spontaneous speech from 32 speakers across 11 Wilayats (provinces). The corpus is balanced to capture regional and lifestyle variation and is annotated at the sentence level with Arabic transcription, English translation, and phonetic transcription using the International Phonetic Alphabet (IPA) through a human-in-the-loop annotation pipeline. OMAN-SPEECH provides a foundational resource for evaluating ASR and related speech technologies on Omani and Gulf Arabic varieties and supports more granular modeling of regional dialectal variation.
%U https://aclanthology.org/2026.abjadnlp-1.31/
%P 229-235
Markdown (Informal)
[OMAN-SPEECH: A Multi-Layer Annotated Speech Corpus for Omani Arabic Dialects](https://aclanthology.org/2026.abjadnlp-1.31/) (Al Khadhuri et al., AbjadNLP 2026)
ACL
- Rayyan S. Al Khadhuri, Firas Al Mahrouqi, Salim Al Mandhari, Amir Azad Al-Kathiri, Omar Said Alshahri, Ghassab Mansoor Alsaqr, Badri Abdulhakim Mudhsh, and Tarek Fatnassi. 2026. OMAN-SPEECH: A Multi-Layer Annotated Speech Corpus for Omani Arabic Dialects. In Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script, pages 229–235, Rabat, Morocco. Association for Computational Linguistics.