@inproceedings{pang-naylor-etal-2025-controllable,
title = "Controllable Clustering with {LLM}-driven Embeddings",
author = "Pang-Naylor, Kerria and
Manivasagan, Shivani and
Zhong, Aitong and
Garg, Mehak and
Mondello, Nicholas and
Buckner, Blake and
Chang, Jonathan P. and
Mahajan, Khyati and
Hashemi, Masoud and
Casati, Fabio",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-industry.48/",
pages = "686--702",
ISBN = "979-8-89176-333-3",
abstract = "Given the inherent subjectivity of similarity in text, fully unsupervised text clustering is unlikely to produce groupings that work across a variety of use cases. Traditional techniques to guide clustering rely on costly, time-consuming human feedback and/or pre-existing labels. Leveraging recent advancements in LLMs and decoder-only embedding models, we present techniques to effectively control text embeddings with minimal human input: prefix instructions and LLM preprocessing. We evaluate clustering performance for datasets with multiple independent ground-truth labels, or perspectives, and find that these techniques can be used to improve clustering for one perspective or use case, at the cost of a tradeoff in performance for another use case."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pang-naylor-etal-2025-controllable">
<titleInfo>
<title>Controllable Clustering with LLM-driven Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kerria</namePart>
<namePart type="family">Pang-Naylor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shivani</namePart>
<namePart type="family">Manivasagan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aitong</namePart>
<namePart type="family">Zhong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehak</namePart>
<namePart type="family">Garg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Mondello</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Blake</namePart>
<namePart type="family">Buckner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khyati</namePart>
<namePart type="family">Mahajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masoud</namePart>
<namePart type="family">Hashemi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Casati</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saloni</namePart>
<namePart type="family">Potdar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lina</namePart>
<namePart type="family">Rojas-Barahona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastien</namePart>
<namePart type="family">Montella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou (China)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-333-3</identifier>
</relatedItem>
<abstract>Given the inherent subjectivity of similarity in text, fully unsupervised text clustering is unlikely to produce groupings that work across a variety of use cases. Traditional techniques to guide clustering rely on costly, time-consuming human feedback and/or pre-existing labels. Leveraging recent advancements in LLMs and decoder-only embedding models, we present techniques to effectively control text embeddings with minimal human input: prefix instructions and LLM preprocessing. We evaluate clustering performance for datasets with multiple independent ground-truth labels, or perspectives, and find that these techniques can be used to improve clustering for one perspective or use case, at the cost of a tradeoff in performance for another use case.</abstract>
<identifier type="citekey">pang-naylor-etal-2025-controllable</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-industry.48/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>686</start>
<end>702</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Controllable Clustering with LLM-driven Embeddings
%A Pang-Naylor, Kerria
%A Manivasagan, Shivani
%A Zhong, Aitong
%A Garg, Mehak
%A Mondello, Nicholas
%A Buckner, Blake
%A Chang, Jonathan P.
%A Mahajan, Khyati
%A Hashemi, Masoud
%A Casati, Fabio
%Y Potdar, Saloni
%Y Rojas-Barahona, Lina
%Y Montella, Sebastien
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou (China)
%@ 979-8-89176-333-3
%F pang-naylor-etal-2025-controllable
%X Given the inherent subjectivity of similarity in text, fully unsupervised text clustering is unlikely to produce groupings that work across a variety of use cases. Traditional techniques to guide clustering rely on costly, time-consuming human feedback and/or pre-existing labels. Leveraging recent advancements in LLMs and decoder-only embedding models, we present techniques to effectively control text embeddings with minimal human input: prefix instructions and LLM preprocessing. We evaluate clustering performance for datasets with multiple independent ground-truth labels, or perspectives, and find that these techniques can be used to improve clustering for one perspective or use case, at the cost of a tradeoff in performance for another use case.
%U https://aclanthology.org/2025.emnlp-industry.48/
%P 686-702
Markdown (Informal)
[Controllable Clustering with LLM-driven Embeddings](https://aclanthology.org/2025.emnlp-industry.48/) (Pang-Naylor et al., EMNLP 2025)
ACL
- Kerria Pang-Naylor, Shivani Manivasagan, Aitong Zhong, Mehak Garg, Nicholas Mondello, Blake Buckner, Jonathan P. Chang, Khyati Mahajan, Masoud Hashemi, and Fabio Casati. 2025. Controllable Clustering with LLM-driven Embeddings. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 686–702, Suzhou (China). Association for Computational Linguistics.