@inproceedings{uemura-etal-2026-afrimteb,
title = "{A}fri{MTEB} and {A}fri{E}5: Benchmarking and Adapting Text Embedding Models for {A}frican Languages",
author = "Uemura, Kosei and
Zhang, Miaoran and
Adelani, David Ifeoluwa",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.171/",
pages = "3697--3717",
ISBN = "979-8-89176-380-7",
abstract = "Text embeddings are an essential building component of several NLP tasks such as retrieval-augmented generation which is crucial for preventing hallucinations in LLMs. Despite the recent release of massively multilingual MTEB (MMTEB), African languages remain underrepresented, with existing tasks often repurposed from translation benchmarks such as FLORES clustering or SIB-200. In this paper, we introduce AfriMTEB{---}a regional expansion of MMTEB covering 59 languages, 14 tasks, and 38 datasets, including six newly added datasets. Unlike many MMTEB datasets that include fewer than five languages, the new additions span 14 to 56 African languages and introduce entirely new tasks, such as hate speech detection, intent detection, and emotion classification, which were not previously covered. Complementing this, we present AfriE5, an adaptation of the instruction-tuned mE5 model to African languages through cross-lingual contrastive distillation. Our evaluation shows that AfriE5 achieves state-of-the-art performance, outperforming strong baselines such as Gemini-Embeddings and mE5."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="uemura-etal-2026-afrimteb">
<titleInfo>
<title>AfriMTEB and AfriE5: Benchmarking and Adapting Text Embedding Models for African Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kosei</namePart>
<namePart type="family">Uemura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miaoran</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">Ifeoluwa</namePart>
<namePart type="family">Adelani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Text embeddings are an essential building component of several NLP tasks such as retrieval-augmented generation which is crucial for preventing hallucinations in LLMs. Despite the recent release of massively multilingual MTEB (MMTEB), African languages remain underrepresented, with existing tasks often repurposed from translation benchmarks such as FLORES clustering or SIB-200. In this paper, we introduce AfriMTEB—a regional expansion of MMTEB covering 59 languages, 14 tasks, and 38 datasets, including six newly added datasets. Unlike many MMTEB datasets that include fewer than five languages, the new additions span 14 to 56 African languages and introduce entirely new tasks, such as hate speech detection, intent detection, and emotion classification, which were not previously covered. Complementing this, we present AfriE5, an adaptation of the instruction-tuned mE5 model to African languages through cross-lingual contrastive distillation. Our evaluation shows that AfriE5 achieves state-of-the-art performance, outperforming strong baselines such as Gemini-Embeddings and mE5.</abstract>
<identifier type="citekey">uemura-etal-2026-afrimteb</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.171/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>3697</start>
<end>3717</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AfriMTEB and AfriE5: Benchmarking and Adapting Text Embedding Models for African Languages
%A Uemura, Kosei
%A Zhang, Miaoran
%A Adelani, David Ifeoluwa
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F uemura-etal-2026-afrimteb
%X Text embeddings are an essential building component of several NLP tasks such as retrieval-augmented generation which is crucial for preventing hallucinations in LLMs. Despite the recent release of massively multilingual MTEB (MMTEB), African languages remain underrepresented, with existing tasks often repurposed from translation benchmarks such as FLORES clustering or SIB-200. In this paper, we introduce AfriMTEB—a regional expansion of MMTEB covering 59 languages, 14 tasks, and 38 datasets, including six newly added datasets. Unlike many MMTEB datasets that include fewer than five languages, the new additions span 14 to 56 African languages and introduce entirely new tasks, such as hate speech detection, intent detection, and emotion classification, which were not previously covered. Complementing this, we present AfriE5, an adaptation of the instruction-tuned mE5 model to African languages through cross-lingual contrastive distillation. Our evaluation shows that AfriE5 achieves state-of-the-art performance, outperforming strong baselines such as Gemini-Embeddings and mE5.
%U https://aclanthology.org/2026.eacl-long.171/
%P 3697-3717
Markdown (Informal)
[AfriMTEB and AfriE5: Benchmarking and Adapting Text Embedding Models for African Languages](https://aclanthology.org/2026.eacl-long.171/) (Uemura et al., EACL 2026)
ACL