@inproceedings{tamber-etal-2025-cant,
title = "Can{'}t Hide Behind the {API}: Stealing Black-Box Commercial Embedding Models",
author = "Tamber, Manveer Singh and
Xian, Jasper and
Lin, Jimmy",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-naacl.104/",
doi = "10.18653/v1/2025.findings-naacl.104",
pages = "1958--1969",
ISBN = "979-8-89176-195-7",
abstract = "Embedding models that generate dense vector representations of text are widely used and hold significant commercial value. Companies such as OpenAI and Cohere offer proprietary embedding models via paid APIs, but despite being ``hidden'' behind APIs, these models are not protected from theft. We present, to our knowledge, the first effort to ``steal'' these models for retrieval by training thief models on text{--}embedding pairs obtained from the APIs. Our experiments demonstrate that it is possible to replicate the retrieval effectiveness of commercial embedding models with a cost of under {\$}300. Notably, our methods allow for distilling from multiple teachers into a single robust student model, and for distilling into presumably smaller models with fewer dimension vectors, yet competitive retrieval effectiveness. Our findings raise important considerations for deploying commercial embedding models and suggest measures to mitigate the risk of model theft."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tamber-etal-2025-cant">
<titleInfo>
<title>Can’t Hide Behind the API: Stealing Black-Box Commercial Embedding Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manveer</namePart>
<namePart type="given">Singh</namePart>
<namePart type="family">Tamber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jasper</namePart>
<namePart type="family">Xian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jimmy</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-195-7</identifier>
</relatedItem>
<abstract>Embedding models that generate dense vector representations of text are widely used and hold significant commercial value. Companies such as OpenAI and Cohere offer proprietary embedding models via paid APIs, but despite being “hidden” behind APIs, these models are not protected from theft. We present, to our knowledge, the first effort to “steal” these models for retrieval by training thief models on text–embedding pairs obtained from the APIs. Our experiments demonstrate that it is possible to replicate the retrieval effectiveness of commercial embedding models with a cost of under $300. Notably, our methods allow for distilling from multiple teachers into a single robust student model, and for distilling into presumably smaller models with fewer dimension vectors, yet competitive retrieval effectiveness. Our findings raise important considerations for deploying commercial embedding models and suggest measures to mitigate the risk of model theft.</abstract>
<identifier type="citekey">tamber-etal-2025-cant</identifier>
<identifier type="doi">10.18653/v1/2025.findings-naacl.104</identifier>
<location>
<url>https://aclanthology.org/2025.findings-naacl.104/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>1958</start>
<end>1969</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can’t Hide Behind the API: Stealing Black-Box Commercial Embedding Models
%A Tamber, Manveer Singh
%A Xian, Jasper
%A Lin, Jimmy
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Findings of the Association for Computational Linguistics: NAACL 2025
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-195-7
%F tamber-etal-2025-cant
%X Embedding models that generate dense vector representations of text are widely used and hold significant commercial value. Companies such as OpenAI and Cohere offer proprietary embedding models via paid APIs, but despite being “hidden” behind APIs, these models are not protected from theft. We present, to our knowledge, the first effort to “steal” these models for retrieval by training thief models on text–embedding pairs obtained from the APIs. Our experiments demonstrate that it is possible to replicate the retrieval effectiveness of commercial embedding models with a cost of under $300. Notably, our methods allow for distilling from multiple teachers into a single robust student model, and for distilling into presumably smaller models with fewer dimension vectors, yet competitive retrieval effectiveness. Our findings raise important considerations for deploying commercial embedding models and suggest measures to mitigate the risk of model theft.
%R 10.18653/v1/2025.findings-naacl.104
%U https://aclanthology.org/2025.findings-naacl.104/
%U https://doi.org/10.18653/v1/2025.findings-naacl.104
%P 1958-1969
Markdown (Informal)
[Can’t Hide Behind the API: Stealing Black-Box Commercial Embedding Models](https://aclanthology.org/2025.findings-naacl.104/) (Tamber et al., Findings 2025)
ACL
Manveer Singh Tamber, Jasper Xian, and Jimmy Lin. 2025. Can’t Hide Behind the API: Stealing Black-Box Commercial Embedding Models. In Findings of the Association for Computational Linguistics: NAACL 2025, pages 1958–1969, Albuquerque, New Mexico. Association for Computational Linguistics.
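
For readers who want a concrete picture of the distillation described in the abstract, the following is a minimal sketch, not the authors' implementation: a local "thief"/student encoder is trained to reproduce the embedding vectors returned by a commercial API for the same texts. The student backbone (bert-base-uncased), the teacher dimension (1536), the cosine objective, and the toy batch are all assumptions made for illustration.

```python
# Hedged sketch: distilling API embeddings into a local student ("thief") encoder.
# Model name, teacher dimension, loss, and data are illustrative assumptions,
# not the setup used in the paper.
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

STUDENT_NAME = "bert-base-uncased"   # assumed student backbone
TEACHER_DIM = 1536                   # assumed API embedding size

class StudentEncoder(nn.Module):
    def __init__(self, name: str, teacher_dim: int):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(name)
        # Linear head maps the student's hidden size to the teacher's dimension.
        self.proj = nn.Linear(self.backbone.config.hidden_size, teacher_dim)

    def forward(self, **enc):
        hidden = self.backbone(**enc).last_hidden_state      # (B, T, H)
        mask = enc["attention_mask"].unsqueeze(-1)            # mean-pool over tokens
        pooled = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1)
        return nn.functional.normalize(self.proj(pooled), dim=-1)

def distill_step(model, tokenizer, texts, teacher_embs, optimizer):
    """One update on a batch of (text, API-embedding) pairs."""
    enc = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    student = model(**enc)
    teacher = nn.functional.normalize(teacher_embs, dim=-1)
    # Cosine-style objective: push student vectors toward the API's vectors.
    loss = (1 - (student * teacher).sum(-1)).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained(STUDENT_NAME)
    model = StudentEncoder(STUDENT_NAME, TEACHER_DIM)
    opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
    # Toy batch standing in for text–embedding pairs harvested from a paid API.
    texts = ["what is dense retrieval?", "embedding models map text to vectors"]
    teacher_embs = torch.randn(len(texts), TEACHER_DIM)
    print(distill_step(model, tok, texts, teacher_embs, opt))
```

Distilling from multiple teachers, as the abstract mentions, could follow the same pattern with one projection head (or one loss term) per teacher; the single-teacher cosine loss above is only the simplest case.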