@inproceedings{manchanda-shivaswamy-2025-name,
    title = "What is in a name? Mitigating Name Bias in Text Embedding Similarity via Anonymization",
    author = "Manchanda, Sahil and
      Shivaswamy, Pannaga",
    editor = "Che, Wanxiang and
      Nabende, Joyce and
      Shutova, Ekaterina and
      Pilehvar, Mohammad Taher",
    booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-acl.914/",
    doi = "10.18653/v1/2025.findings-acl.914",
    pages = "17759--17781",
    isbn = "979-8-89176-256-5",
    abstract = "Text-embedding models often exhibit biases arising from the data on which they are trained. In this paper, we examine a hitherto unexplored bias in text-embeddings: bias arising from the presence of \textit{names} such as persons, locations, organizations etc. in the text. Our study shows how the presence of \textit{name-bias} in text-embedding models can potentially lead to erroneous conclusions in the assessment of thematic similarity. \textit{Text-embeddings can mistakenly indicate similarity between texts based on names in the text, even when their actual semantic content has no similarity or indicate dissimilarity simply because of the names in the text even when the texts match semantically}. We first demonstrate the presence of name bias in different text-embedding models and then propose \textit{text-anonymization} during inference which involves removing references to names, while preserving the core theme of the text. The efficacy of the anonymization approach is demonstrated on three downstream NLP tasks involving embedding similarities, achieving significant performance gains. Our simple and training-optimization-free approach offers a practical and easily implementable solution to mitigate name bias."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="manchanda-shivaswamy-2025-name">
<titleInfo>
<title>What is in a name? Mitigating Name Bias in Text Embedding Similarity via Anonymization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sahil</namePart>
<namePart type="family">Manchanda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pannaga</namePart>
<namePart type="family">Shivaswamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Text-embedding models often exhibit biases arising from the data on which they are trained. In this paper, we examine a hitherto unexplored bias in text-embeddings: bias arising from the presence of names such as persons, locations, organizations etc. in the text. Our study shows how the presence of name-bias in text-embedding models can potentially lead to erroneous conclusions in the assessment of thematic similarity. Text-embeddings can mistakenly indicate similarity between texts based on names in the text, even when their actual semantic content has no similarity or indicate dissimilarity simply because of the names in the text even when the texts match semantically. We first demonstrate the presence of name bias in different text-embedding models and then propose text-anonymization during inference which involves removing references to names, while preserving the core theme of the text. The efficacy of the anonymization approach is demonstrated on three downstream NLP tasks involving embedding similarities, achieving significant performance gains. Our simple and training-optimization-free approach offers a practical and easily implementable solution to mitigate name bias.</abstract>
<identifier type="citekey">manchanda-shivaswamy-2025-name</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.914</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.914/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>17759</start>
<end>17781</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T What is in a name? Mitigating Name Bias in Text Embedding Similarity via Anonymization
%A Manchanda, Sahil
%A Shivaswamy, Pannaga
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F manchanda-shivaswamy-2025-name
%X Text-embedding models often exhibit biases arising from the data on which they are trained. In this paper, we examine a hitherto unexplored bias in text-embeddings: bias arising from the presence of names such as persons, locations, organizations etc. in the text. Our study shows how the presence of name-bias in text-embedding models can potentially lead to erroneous conclusions in the assessment of thematic similarity. Text-embeddings can mistakenly indicate similarity between texts based on names in the text, even when their actual semantic content has no similarity or indicate dissimilarity simply because of the names in the text even when the texts match semantically. We first demonstrate the presence of name bias in different text-embedding models and then propose text-anonymization during inference which involves removing references to names, while preserving the core theme of the text. The efficacy of the anonymization approach is demonstrated on three downstream NLP tasks involving embedding similarities, achieving significant performance gains. Our simple and training-optimization-free approach offers a practical and easily implementable solution to mitigate name bias.
%R 10.18653/v1/2025.findings-acl.914
%U https://aclanthology.org/2025.findings-acl.914/
%U https://doi.org/10.18653/v1/2025.findings-acl.914
%P 17759-17781
Markdown (Informal)
[What is in a name? Mitigating Name Bias in Text Embedding Similarity via Anonymization](https://aclanthology.org/2025.findings-acl.914/) (Manchanda & Shivaswamy, Findings 2025)
ACL