@inproceedings{gedi-etal-2026-morphologically,
title = "Morphologically-informed {S}omali Lemmatization Corpus built with a Web-based Crowdsourcing Platform",
author = "Gedi, Abdifatah Ahmed and
Mohamed, Shafie Abdi and
Yusuf, Yusuf A. and
Mohamed, Muhidin A. and
Hassan, Fuad Mire and
Assowe, Houssein A.",
editor = "Chimoto, Everlyn Asiko and
Lignos, Constantine and
Muhammad, Shamsuddeen and
Abdulmumin, Idris and
Siro, Clemencia and
Adelani, David Ifeoluwa",
booktitle = "Proceedings of the 7th Workshop on {A}frican Natural Language Processing ({A}frica{NLP} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.africanlp-main.17/",
pages = "179--189",
ISBN = "979-8-89176-364-7",
abstract = "Lemmatization, which reduces words to their root forms, plays a key role in tasks such as information retrieval, text indexing, and machine-learning-based language models. However, a key research challenge for low-resourced languages such as the Somali is the lack of human-annotated lemmatization datasets and reliable ground truth to underpin accurate morphological analysis and training relevant NLP models. To address this problem, we developed the first large-scale, purpose-built Somali lemmatization lexicon, coupled with a crowdsourcing platform for ongoing expansion. The system leverages Somali{'}s agglutinative and derivational morphology, encompassing over 5,584 root words and 78,629 derivative forms, each annotated with part-of-speech tags. For data validation purpose, we have devised a pilot lexicon-based lemmatizer integrated with rule-based logic to handle out-of-vocabulary terms. Evaluation on a 294-document corpus covering news articles, social media posts, and short messages shows lemmatization accuracies of 51.27{\%} for full articles, 44.14{\%} for excerpts, and 59.51{\%} for short texts such as tweets. These results demonstrate that combining lexical resources, POS tagging, and rule-based strategies provides a robust and scalable framework for addressing morphological complexity in Somali and other low-resource languages."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gedi-etal-2026-morphologically">
<titleInfo>
<title>Morphologically-informed Somali Lemmatization Corpus built with a Web-based Crowdsourcing Platform</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abdifatah</namePart>
<namePart type="given">Ahmed</namePart>
<namePart type="family">Gedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shafie</namePart>
<namePart type="given">Abdi</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuf</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Yusuf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhidin</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Mohamed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fuad</namePart>
<namePart type="given">Mire</namePart>
<namePart type="family">Hassan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houssein</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Assowe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Everlyn</namePart>
<namePart type="given">Asiko</namePart>
<namePart type="family">Chimoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Constantine</namePart>
<namePart type="family">Lignos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shamsuddeen</namePart>
<namePart type="family">Muhammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Idris</namePart>
<namePart type="family">Abdulmumin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clemencia</namePart>
<namePart type="family">Siro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Ifeoluwa</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-364-7</identifier>
</relatedItem>
<abstract>Lemmatization, which reduces words to their root forms, plays a key role in tasks such as information retrieval, text indexing, and machine-learning-based language models. However, a key research challenge for low-resourced languages such as the Somali is the lack of human-annotated lemmatization datasets and reliable ground truth to underpin accurate morphological analysis and training relevant NLP models. To address this problem, we developed the first large-scale, purpose-built Somali lemmatization lexicon, coupled with a crowdsourcing platform for ongoing expansion. The system leverages Somali’s agglutinative and derivational morphology, encompassing over 5,584 root words and 78,629 derivative forms, each annotated with part-of-speech tags. For data validation purpose, we have devised a pilot lexicon-based lemmatizer integrated with rule-based logic to handle out-of-vocabulary terms. Evaluation on a 294-document corpus covering news articles, social media posts, and short messages shows lemmatization accuracies of 51.27% for full articles, 44.14% for excerpts, and 59.51% for short texts such as tweets. These results demonstrate that combining lexical resources, POS tagging, and rule-based strategies provides a robust and scalable framework for addressing morphological complexity in Somali and other low-resource languages.</abstract>
<identifier type="citekey">gedi-etal-2026-morphologically</identifier>
<location>
<url>https://aclanthology.org/2026.africanlp-main.17/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>179</start>
<end>189</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Morphologically-informed Somali Lemmatization Corpus built with a Web-based Crowdsourcing Platform
%A Gedi, Abdifatah Ahmed
%A Mohamed, Shafie Abdi
%A Yusuf, Yusuf A.
%A Mohamed, Muhidin A.
%A Hassan, Fuad Mire
%A Assowe, Houssein A.
%Y Chimoto, Everlyn Asiko
%Y Lignos, Constantine
%Y Muhammad, Shamsuddeen
%Y Abdulmumin, Idris
%Y Siro, Clemencia
%Y Adelani, David Ifeoluwa
%S Proceedings of the 7th Workshop on African Natural Language Processing (AfricaNLP 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-364-7
%F gedi-etal-2026-morphologically
%X Lemmatization, which reduces words to their root forms, plays a key role in tasks such as information retrieval, text indexing, and machine-learning-based language models. However, a key research challenge for low-resourced languages such as the Somali is the lack of human-annotated lemmatization datasets and reliable ground truth to underpin accurate morphological analysis and training relevant NLP models. To address this problem, we developed the first large-scale, purpose-built Somali lemmatization lexicon, coupled with a crowdsourcing platform for ongoing expansion. The system leverages Somali’s agglutinative and derivational morphology, encompassing over 5,584 root words and 78,629 derivative forms, each annotated with part-of-speech tags. For data validation purpose, we have devised a pilot lexicon-based lemmatizer integrated with rule-based logic to handle out-of-vocabulary terms. Evaluation on a 294-document corpus covering news articles, social media posts, and short messages shows lemmatization accuracies of 51.27% for full articles, 44.14% for excerpts, and 59.51% for short texts such as tweets. These results demonstrate that combining lexical resources, POS tagging, and rule-based strategies provides a robust and scalable framework for addressing morphological complexity in Somali and other low-resource languages.
%U https://aclanthology.org/2026.africanlp-main.17/
%P 179-189
Markdown (Informal)
[Morphologically-informed Somali Lemmatization Corpus built with a Web-based Crowdsourcing Platform](https://aclanthology.org/2026.africanlp-main.17/) (Gedi et al., AfricaNLP 2026)
ACL