@inproceedings{bharati-etal-2026-ajamimorph,
title = "{A}jami{M}orph: Zero-Annotation Morphological Discovery for {H}ausa {A}jami via Multi-Method Consensus",
author = "Bharati, Soumedhik and
Mandal, Shibam and
Ghosh, Prithwish and
Ghosh, Swarup Kr and
Mondal, Sayani",
booktitle = "Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.abjadnlp-1.23/",
pages = "166--171",
abstract = "Hausa Ajami (Hausa written in Arabic script) remains severely under-resourced for computational morphology. We present AjamiMorph, a zero-annotation framework that discovers morphemes through consensus among three unsupervised methods, namely, Byte Pair Encoding (BPE), transition-based boundary detection using Pointwise Mutual Information (PMI), and computational linguistics based Distributional Affix Mining (DAM). Using a Hausa Ajami Bible corpus consisting of 637,414 tokens, AjamiMorph identifies 1,611 high-confidence morphemes, achieving 99.9{\%} coverage. The inventory exhibits a linguistically realistic distribution (66.0{\%} stems, 22.6{\%} suffixes, 11.4{\%} prefixes) and recovers 77.8{\%} of known Hausa affixes. A permutation test that shuffles method assignments (preserving per-method selection sizes) confirms that the observed agreement is above-chance; chi-square remains as a secondary check. A lightweight 5-gram LM comparison (characters vs. consensus morphemes) provides an extrinsic signal. We also report negative results for script-driven Arabic assumptions and LLM-first annotation. This work provides the first unsupervised morpheme inventory for Hausa Ajami and demonstrates consensus as a robust strategy for zero-resource morphology."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bharati-etal-2026-ajamimorph">
<titleInfo>
<title>AjamiMorph: Zero-Annotation Morphological Discovery for Hausa Ajami via Multi-Method Consensus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Soumedhik</namePart>
<namePart type="family">Bharati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shibam</namePart>
<namePart type="family">Mandal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prithwish</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Swarup</namePart>
<namePart type="given">Kr</namePart>
<namePart type="family">Ghosh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sayani</namePart>
<namePart type="family">Mondal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Hausa Ajami (Hausa written in Arabic script) remains severely under-resourced for computational morphology. We present AjamiMorph, a zero-annotation framework that discovers morphemes through consensus among three unsupervised methods, namely, Byte Pair Encoding (BPE), transition-based boundary detection using Pointwise Mutual Information (PMI), and computational linguistics based Distributional Affix Mining (DAM). Using a Hausa Ajami Bible corpus consisting of 637,414 tokens, AjamiMorph identifies 1,611 high-confidence morphemes, achieving 99.9% coverage. The inventory exhibits a linguistically realistic distribution (66.0% stems, 22.6% suffixes, 11.4% prefixes) and recovers 77.8% of known Hausa affixes. A permutation test that shuffles method assignments (preserving per-method selection sizes) confirms that the observed agreement is above-chance; chi-square remains as a secondary check. A lightweight 5-gram LM comparison (characters vs. consensus morphemes) provides an extrinsic signal. We also report negative results for script-driven Arabic assumptions and LLM-first annotation. This work provides the first unsupervised morpheme inventory for Hausa Ajami and demonstrates consensus as a robust strategy for zero-resource morphology.</abstract>
<identifier type="citekey">bharati-etal-2026-ajamimorph</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.23/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>166</start>
<end>171</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AjamiMorph: Zero-Annotation Morphological Discovery for Hausa Ajami via Multi-Method Consensus
%A Bharati, Soumedhik
%A Mandal, Shibam
%A Ghosh, Prithwish
%A Ghosh, Swarup Kr
%A Mondal, Sayani
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F bharati-etal-2026-ajamimorph
%X Hausa Ajami (Hausa written in Arabic script) remains severely under-resourced for computational morphology. We present AjamiMorph, a zero-annotation framework that discovers morphemes through consensus among three unsupervised methods, namely, Byte Pair Encoding (BPE), transition-based boundary detection using Pointwise Mutual Information (PMI), and computational linguistics based Distributional Affix Mining (DAM). Using a Hausa Ajami Bible corpus consisting of 637,414 tokens, AjamiMorph identifies 1,611 high-confidence morphemes, achieving 99.9% coverage. The inventory exhibits a linguistically realistic distribution (66.0% stems, 22.6% suffixes, 11.4% prefixes) and recovers 77.8% of known Hausa affixes. A permutation test that shuffles method assignments (preserving per-method selection sizes) confirms that the observed agreement is above-chance; chi-square remains as a secondary check. A lightweight 5-gram LM comparison (characters vs. consensus morphemes) provides an extrinsic signal. We also report negative results for script-driven Arabic assumptions and LLM-first annotation. This work provides the first unsupervised morpheme inventory for Hausa Ajami and demonstrates consensus as a robust strategy for zero-resource morphology.
%U https://aclanthology.org/2026.abjadnlp-1.23/
%P 166-171
Markdown (Informal)
[AjamiMorph: Zero-Annotation Morphological Discovery for Hausa Ajami via Multi-Method Consensus](https://aclanthology.org/2026.abjadnlp-1.23/) (Bharati et al., AbjadNLP 2026)
ACL