@inproceedings{hammoud-etal-2026-hala,
title = "Hala Technical Report Building {A}rabic-Centric Instruction {\&} Translation Models at Scale",
author = "Hammoud, Hasan Abed Al Kader and
Zbib, Mohamad Bilal and
Ghanem, Bernard",
booktitle = "Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.abjadnlp-1.32/",
pages = "236--244",
abstract = "We present HALA, a family of Arabic-centric instruction and translation models built with our translate-and-tune pipeline. We first compress a strong AR{\ensuremath{\leftrightarrow}}EN teacher to FP8 (yielding {\textasciitilde}2{\texttimes} higher throughput with no quality loss) and use it to create high-fidelity bilingual supervision. A lightweight language model LFM2{--}1.2B is then fine-tuned on this data and used to translate high-quality English instruction sets into Arabic, producing a million-scale corpus tailored to instruction following. We train HALA models at 350M, 700M, 1.2B, and 9B parameters, and apply slerp merging to balance Arabic specialization with base-model strengths. On Arabic-centric benchmarks, HALA achieves state-of-the-art results within both the ``nano'' ({\ensuremath{\leq}}2B) and ``small'' (7{--}9B) categories, outperforming their bases. We are committed to release models, data, evaluation, and recipes to accelerate research in Arabic NLP."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hammoud-etal-2026-hala">
<titleInfo>
<title>Hala Technical Report Building Arabic-Centric Instruction & Translation Models at Scale</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hasan</namePart>
<namePart type="given">Abed</namePart>
<namePart type="given">Al</namePart>
<namePart type="given">Kader</namePart>
<namePart type="family">Hammoud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamad</namePart>
<namePart type="given">Bilal</namePart>
<namePart type="family">Zbib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bernard</namePart>
<namePart type="family">Ghanem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present HALA, a family of Arabic-centric instruction and translation models built with our translate-and-tune pipeline. We first compress a strong AR\ensuremathłeftrightarrowEN teacher to FP8 (yielding ~2× higher throughput with no quality loss) and use it to create high-fidelity bilingual supervision. A lightweight language model LFM2–1.2B is then fine-tuned on this data and used to translate high-quality English instruction sets into Arabic, producing a million-scale corpus tailored to instruction following. We train HALA models at 350M, 700M, 1.2B, and 9B parameters, and apply slerp merging to balance Arabic specialization with base-model strengths. On Arabic-centric benchmarks, HALA achieves state-of-the-art results within both the “nano” (\ensuremathłeq2B) and “small” (7–9B) categories, outperforming their bases. We are committed to release models, data, evaluation, and recipes to accelerate research in Arabic NLP.</abstract>
<identifier type="citekey">hammoud-etal-2026-hala</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.32/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>236</start>
<end>244</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Hala Technical Report: Building Arabic-Centric Instruction & Translation Models at Scale
%A Hammoud, Hasan Abed Al Kader
%A Zbib, Mohamad Bilal
%A Ghanem, Bernard
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F hammoud-etal-2026-hala
%X We present HALA, a family of Arabic-centric instruction and translation models built with our translate-and-tune pipeline. We first compress a strong AR↔EN teacher to FP8 (yielding ~2× higher throughput with no quality loss) and use it to create high-fidelity bilingual supervision. A lightweight language model, LFM2-1.2B, is then fine-tuned on this data and used to translate high-quality English instruction sets into Arabic, producing a million-scale corpus tailored to instruction following. We train HALA models at 350M, 700M, 1.2B, and 9B parameters, and apply slerp merging to balance Arabic specialization with base-model strengths. On Arabic-centric benchmarks, HALA achieves state-of-the-art results within both the “nano” (≤2B) and “small” (7–9B) categories, outperforming their bases. We are committed to releasing models, data, evaluation, and recipes to accelerate research in Arabic NLP.
%U https://aclanthology.org/2026.abjadnlp-1.32/
%P 236-244
Markdown (Informal)
[Hala Technical Report: Building Arabic-Centric Instruction & Translation Models at Scale](https://aclanthology.org/2026.abjadnlp-1.32/) (Hammoud et al., AbjadNLP 2026)
ACL
Hasan Abed Al Kader Hammoud, Mohamad Bilal Zbib, and Bernard Ghanem. 2026. Hala Technical Report: Building Arabic-Centric Instruction & Translation Models at Scale. In Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script, pages 236–244, Rabat, Morocco. Association for Computational Linguistics.