@inproceedings{poswiata-etal-2026-pl,
title = "{PL}-{MTEB}: {P}olish Massive Text Embedding Benchmark",
author = "Po{\'s}wiata, Rafa{\l} and
Dadas, S{\l}awomir and
Pere{\l}kiewicz, Micha{\l} Wiktor",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1773/",
pages = "35601--35619",
ISBN = "979-8-89176-395-1",
abstract = "In this paper, we introduce the Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in the Polish language. PL-MTEB comprises 30 diverse NLP tasks across five categories: classification, clustering, pair classification, information retrieval, and semantic text similarity. Within the scope of this work, we added 12 new Polish-language tasks to MTEB based on existing datasets and prepared two new datasets used to create four clustering tasks. We evaluated 30 publicly available text embedding models, including Polish and multilingual models. We analyzed the results in detail for specific task types and model sizes. We made the prepared datasets, the source code for evaluation, and the obtained results available to the public at https://github.com/rafalposwiata/pl-mteb."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="poswiata-etal-2026-pl">
<titleInfo>
<title>PL-MTEB: Polish Massive Text Embedding Benchmark</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rafał</namePart>
<namePart type="family">Poświata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sławomir</namePart>
<namePart type="family">Dadas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michał</namePart>
<namePart type="given">Wiktor</namePart>
<namePart type="family">Perełkiewicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>In this paper, we introduce the Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in the Polish language. PL-MTEB comprises 30 diverse NLP tasks across five categories: classification, clustering, pair classification, information retrieval, and semantic text similarity. Within the scope of this work, we added 12 new Polish-language tasks to MTEB based on existing datasets and prepared two new datasets used to create four clustering tasks. We evaluated 30 publicly available text embedding models, including Polish and multilingual models. We analyzed the results in detail for specific task types and model sizes. We made the prepared datasets, the source code for evaluation, and the obtained results available to the public at https://github.com/rafalposwiata/pl-mteb.</abstract>
<identifier type="citekey">poswiata-etal-2026-pl</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1773/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>35601</start>
<end>35619</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PL-MTEB: Polish Massive Text Embedding Benchmark
%A Poświata, Rafał
%A Dadas, Sławomir
%A Perełkiewicz, Michał Wiktor
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F poswiata-etal-2026-pl
%X In this paper, we introduce the Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in the Polish language. PL-MTEB comprises 30 diverse NLP tasks across five categories: classification, clustering, pair classification, information retrieval, and semantic text similarity. Within the scope of this work, we added 12 new Polish-language tasks to MTEB based on existing datasets and prepared two new datasets used to create four clustering tasks. We evaluated 30 publicly available text embedding models, including Polish and multilingual models. We analyzed the results in detail for specific task types and model sizes. We made the prepared datasets, the source code for evaluation, and the obtained results available to the public at https://github.com/rafalposwiata/pl-mteb.
%U https://aclanthology.org/2026.findings-acl.1773/
%P 35601-35619
Markdown (Informal)
[PL-MTEB: Polish Massive Text Embedding Benchmark](https://aclanthology.org/2026.findings-acl.1773/) (Poświata et al., Findings 2026)
ACL
- Rafał Poświata, Sławomir Dadas, and Michał Wiktor Perełkiewicz. 2026. PL-MTEB: Polish Massive Text Embedding Benchmark. In Findings of the Association for Computational Linguistics: ACL 2026, pages 35601–35619, San Diego, California, United States. Association for Computational Linguistics.