@inproceedings{tsukagoshi-sasano-2025-redundancy,
title = "Redundancy, Isotropy, and Intrinsic Dimensionality of Prompt-based Text Embeddings",
author = "Tsukagoshi, Hayato and
Sasano, Ryohei",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1330/",
doi = "10.18653/v1/2025.findings-acl.1330",
pages = "25915--25930",
ISBN = "979-8-89176-256-5",
abstract = "Prompt-based text embedding models, which generate task-specific embeddings upon receiving tailored prompts, have recently demonstrated remarkable performance. However, their resulting embeddings often have thousands of dimensions, leading to high storage costs and increased computational costs of embedding-based operations. In this paper, we investigate how post-hoc dimensionality reduction applied to the embeddings affects the performance of various tasks that leverage these embeddings, specifically classification, clustering, retrieval, and semantic textual similarity (STS) tasks. Our experiments show that even a naive dimensionality reduction, which keeps only the first 25{\%} of the dimensions of the embeddings, results in a very slight performance degradation, indicating that these embeddings are highly redundant. Notably, for classification and clustering, even when embeddings are reduced to less than 0.5{\%} of the original dimensionality the performance degradation is very small. To quantitatively analyze this redundancy, we perform an analysis based on the intrinsic dimensionality and isotropy of the embeddings. Our analysis reveals that embeddings for classification and clustering, which are considered to have very high dimensional redundancy, exhibit lower intrinsic dimensionality and less isotropy compared with those for retrieval and STS."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="tsukagoshi-sasano-2025-redundancy">
    <titleInfo>
      <title>Redundancy, Isotropy, and Intrinsic Dimensionality of Prompt-based Text Embeddings</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Hayato</namePart>
      <namePart type="family">Tsukagoshi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ryohei</namePart>
      <namePart type="family">Sasano</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>Prompt-based text embedding models, which generate task-specific embeddings upon receiving tailored prompts, have recently demonstrated remarkable performance. However, their resulting embeddings often have thousands of dimensions, leading to high storage costs and increased computational costs of embedding-based operations. In this paper, we investigate how post-hoc dimensionality reduction applied to the embeddings affects the performance of various tasks that leverage these embeddings, specifically classification, clustering, retrieval, and semantic textual similarity (STS) tasks. Our experiments show that even a naive dimensionality reduction, which keeps only the first 25% of the dimensions of the embeddings, results in a very slight performance degradation, indicating that these embeddings are highly redundant. Notably, for classification and clustering, even when embeddings are reduced to less than 0.5% of the original dimensionality, the performance degradation is very small. To quantitatively analyze this redundancy, we perform an analysis based on the intrinsic dimensionality and isotropy of the embeddings. Our analysis reveals that embeddings for classification and clustering, which are considered to have very high dimensional redundancy, exhibit lower intrinsic dimensionality and less isotropy compared with those for retrieval and STS.</abstract>
<identifier type="citekey">tsukagoshi-sasano-2025-redundancy</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1330</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1330/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>25915</start>
<end>25930</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Redundancy, Isotropy, and Intrinsic Dimensionality of Prompt-based Text Embeddings
%A Tsukagoshi, Hayato
%A Sasano, Ryohei
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F tsukagoshi-sasano-2025-redundancy
%X Prompt-based text embedding models, which generate task-specific embeddings upon receiving tailored prompts, have recently demonstrated remarkable performance. However, their resulting embeddings often have thousands of dimensions, leading to high storage costs and increased computational costs of embedding-based operations. In this paper, we investigate how post-hoc dimensionality reduction applied to the embeddings affects the performance of various tasks that leverage these embeddings, specifically classification, clustering, retrieval, and semantic textual similarity (STS) tasks. Our experiments show that even a naive dimensionality reduction, which keeps only the first 25% of the dimensions of the embeddings, results in a very slight performance degradation, indicating that these embeddings are highly redundant. Notably, for classification and clustering, even when embeddings are reduced to less than 0.5% of the original dimensionality, the performance degradation is very small. To quantitatively analyze this redundancy, we perform an analysis based on the intrinsic dimensionality and isotropy of the embeddings. Our analysis reveals that embeddings for classification and clustering, which are considered to have very high dimensional redundancy, exhibit lower intrinsic dimensionality and less isotropy compared with those for retrieval and STS.
%R 10.18653/v1/2025.findings-acl.1330
%U https://aclanthology.org/2025.findings-acl.1330/
%U https://doi.org/10.18653/v1/2025.findings-acl.1330
%P 25915-25930

Markdown (Informal)
[Redundancy, Isotropy, and Intrinsic Dimensionality of Prompt-based Text Embeddings](https://aclanthology.org/2025.findings-acl.1330/) (Tsukagoshi & Sasano, Findings 2025)

ACL
Hayato Tsukagoshi and Ryohei Sasano. 2025. [Redundancy, Isotropy, and Intrinsic Dimensionality of Prompt-based Text Embeddings](https://aclanthology.org/2025.findings-acl.1330/). In *Findings of the Association for Computational Linguistics: ACL 2025*, pages 25915–25930, Vienna, Austria. Association for Computational Linguistics.
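
The abstract above (repeated in each citation format) describes two concrete operations: a naive post-hoc dimensionality reduction that keeps only the first fraction of each embedding's dimensions, and an isotropy analysis of the resulting embedding space. The sketch below is a minimal, hypothetical illustration of those ideas, not the authors' released code: the function names, the 4096-dimensional random data, and the spectral-entropy isotropy proxy are all illustrative assumptions.

```python
import numpy as np

def truncate_embeddings(embeddings: np.ndarray, keep_ratio: float = 0.25) -> np.ndarray:
    """Naive post-hoc dimensionality reduction: keep only the first
    `keep_ratio` fraction of dimensions, then L2-renormalize so that
    cosine similarity remains well defined."""
    k = max(1, int(embeddings.shape[1] * keep_ratio))
    reduced = embeddings[:, :k]
    norms = np.linalg.norm(reduced, axis=1, keepdims=True)
    return reduced / np.clip(norms, 1e-12, None)

def isotropy_proxy(embeddings: np.ndarray) -> float:
    """Crude isotropy proxy (an assumption, not necessarily the paper's
    measure): normalized entropy of the centered embeddings' squared
    singular-value spectrum. 1.0 means variance is spread evenly over
    all directions; lower values mean a few directions dominate."""
    centered = embeddings - embeddings.mean(axis=0, keepdims=True)
    s = np.linalg.svd(centered, compute_uv=False)
    p = s**2 / np.sum(s**2)
    return float(-np.sum(p * np.log(p + 1e-12)) / np.log(len(p)))

# Hypothetical usage with random data standing in for real embeddings
# (prompt-based models often emit thousands of dimensions, e.g. 4096).
rng = np.random.default_rng(0)
emb = rng.normal(size=(1000, 4096))
print(truncate_embeddings(emb, 0.25).shape)    # (1000, 1024): first 25% of dims
print(truncate_embeddings(emb, 0.005).shape)   # (1000, 20): under 0.5% of dims
print(round(isotropy_proxy(emb), 3))           # close to 1.0 for Gaussian noise
```

Renormalizing after truncation matters because downstream retrieval and STS pipelines typically compare embeddings by cosine similarity, which assumes unit-norm vectors.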