@inproceedings{althubaiti-etal-2025-octopus,
title = "Octopus: Towards Building the {A}rabic Speech {LLM} Suite",
author = "Althubaiti, Sara and
Lodagala, Vasista Sai and
Clark, Tjad and
Elshahawy, Yousseif Ahmed and
Izham, Daniel and
Alrajeh, Abdullah and
Bin Tamran, Aljawahrah and
Ali, Ahmed",
editor = "Darwish, Kareem and
Ali, Ahmed and
Abu Farha, Ibrahim and
Touileb, Samia and
Zitouni, Imed and
Abdelali, Ahmed and
Al-Ghamdi, Sharefah and
Alkhereyf, Sakhar and
Zaghouani, Wajdi and
Khalifa, Salam and
AlKhamissi, Badr and
Almatham, Rawan and
Hamed, Injy and
Alyafeai, Zaid and
Alowisheq, Areeb and
Inoue, Go and
Mrini, Khalil and
Alshammari, Waad",
booktitle = "Proceedings of The Third Arabic Natural Language Processing Conference",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.arabicnlp-main.35/",
pages = "425--435",
ISBN = "979-8-89176-352-4",
abstract = "We present Octopus, a first family of modular speech-language models designed for Arabic-English ASR, dialect identification, and speech translation. Built on Whisper-V3 and enhanced with large language models like ALLaM, LLaMA, and DeepSeek, Octopus bridges speech and text through a lightweight projection layer and Q-Former. To broaden its scope beyond speech, Octopus integrates BEATs, a general-purpose audio encoder allowing it to understand both linguistic and acoustic events. Despite its simplicity, this dual-encoder design supports robust performance across multilingual and code-switched scenarios. We also introduce TinyOctopus, a distilled variant using smaller models (Distil-Whisper + LLaMA3-1B / DeepSeek-1.5B), achieving competitive results with just a fraction of the parameters. Fine-tuning on synthetic code-switched data further boosts its performance. Octopus demonstrates the power of compact, extensible architectures in Arabic-centric speech modeling and sets the stage for unified multilingual audio-language understanding."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="althubaiti-etal-2025-octopus">
<titleInfo>
<title>Octopus: Towards Building the Arabic Speech LLM Suite</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Althubaiti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vasista</namePart>
<namePart type="given">Sai</namePart>
<namePart type="family">Lodagala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tjad</namePart>
<namePart type="family">Clark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yousseif</namePart>
<namePart type="given">Ahmed</namePart>
<namePart type="family">Elshahawy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Izham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdullah</namePart>
<namePart type="family">Alrajeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aljawahrah</namePart>
<namePart type="family">Bin Tamran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The Third Arabic Natural Language Processing Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Ali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ibrahim</namePart>
<namePart type="family">Abu Farha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samia</namePart>
<namePart type="family">Touileb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Imed</namePart>
<namePart type="family">Zitouni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharefah</namePart>
<namePart type="family">Al-Ghamdi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakhar</namePart>
<namePart type="family">Alkhereyf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Salam</namePart>
<namePart type="family">Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Badr</namePart>
<namePart type="family">AlKhamissi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rawan</namePart>
<namePart type="family">Almatham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Injy</namePart>
<namePart type="family">Hamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zaid</namePart>
<namePart type="family">Alyafeai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Areeb</namePart>
<namePart type="family">Alowisheq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Go</namePart>
<namePart type="family">Inoue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalil</namePart>
<namePart type="family">Mrini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Waad</namePart>
<namePart type="family">Alshammari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-352-4</identifier>
</relatedItem>
<abstract>We present Octopus, a first family of modular speech-language models designed for Arabic-English ASR, dialect identification, and speech translation. Built on Whisper-V3 and enhanced with large language models like ALLaM, LLaMA, and DeepSeek, Octopus bridges speech and text through a lightweight projection layer and Q-Former. To broaden its scope beyond speech, Octopus integrates BEATs, a general-purpose audio encoder allowing it to understand both linguistic and acoustic events. Despite its simplicity, this dual-encoder design supports robust performance across multilingual and code-switched scenarios. We also introduce TinyOctopus, a distilled variant using smaller models (Distil-Whisper + LLaMA3-1B / DeepSeek-1.5B), achieving competitive results with just a fraction of the parameters. Fine-tuning on synthetic code-switched data further boosts its performance. Octopus demonstrates the power of compact, extensible architectures in Arabic-centric speech modeling and sets the stage for unified multilingual audio-language understanding.</abstract>
<identifier type="citekey">althubaiti-etal-2025-octopus</identifier>
<location>
<url>https://aclanthology.org/2025.arabicnlp-main.35/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>425</start>
<end>435</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Octopus: Towards Building the Arabic Speech LLM Suite
%A Althubaiti, Sara
%A Lodagala, Vasista Sai
%A Clark, Tjad
%A Elshahawy, Yousseif Ahmed
%A Izham, Daniel
%A Alrajeh, Abdullah
%A Bin Tamran, Aljawahrah
%A Ali, Ahmed
%Y Darwish, Kareem
%Y Ali, Ahmed
%Y Abu Farha, Ibrahim
%Y Touileb, Samia
%Y Zitouni, Imed
%Y Abdelali, Ahmed
%Y Al-Ghamdi, Sharefah
%Y Alkhereyf, Sakhar
%Y Zaghouani, Wajdi
%Y Khalifa, Salam
%Y AlKhamissi, Badr
%Y Almatham, Rawan
%Y Hamed, Injy
%Y Alyafeai, Zaid
%Y Alowisheq, Areeb
%Y Inoue, Go
%Y Mrini, Khalil
%Y Alshammari, Waad
%S Proceedings of The Third Arabic Natural Language Processing Conference
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-352-4
%F althubaiti-etal-2025-octopus
%X We present Octopus, a first family of modular speech-language models designed for Arabic-English ASR, dialect identification, and speech translation. Built on Whisper-V3 and enhanced with large language models like ALLaM, LLaMA, and DeepSeek, Octopus bridges speech and text through a lightweight projection layer and Q-Former. To broaden its scope beyond speech, Octopus integrates BEATs, a general-purpose audio encoder allowing it to understand both linguistic and acoustic events. Despite its simplicity, this dual-encoder design supports robust performance across multilingual and code-switched scenarios. We also introduce TinyOctopus, a distilled variant using smaller models (Distil-Whisper + LLaMA3-1B / DeepSeek-1.5B), achieving competitive results with just a fraction of the parameters. Fine-tuning on synthetic code-switched data further boosts its performance. Octopus demonstrates the power of compact, extensible architectures in Arabic-centric speech modeling and sets the stage for unified multilingual audio-language understanding.
%U https://aclanthology.org/2025.arabicnlp-main.35/
%P 425-435
Markdown (Informal)
[Octopus: Towards Building the Arabic Speech LLM Suite](https://aclanthology.org/2025.arabicnlp-main.35/) (Althubaiti et al., ArabicNLP 2025)
ACL
- Sara Althubaiti, Vasista Sai Lodagala, Tjad Clark, Yousseif Ahmed Elshahawy, Daniel Izham, Abdullah Alrajeh, Aljawahrah Bin Tamran, and Ahmed Ali. 2025. Octopus: Towards Building the Arabic Speech LLM Suite. In Proceedings of The Third Arabic Natural Language Processing Conference, pages 425–435, Suzhou, China. Association for Computational Linguistics.