@inproceedings{wang-etal-2024-bpid,
title = "{BPID}: A Benchmark for Personal Identity Deduplication",
author = "Wang, Runhui and
Tao, Yefan and
Krishnan, Adit and
Kong, Luyang and
Liu, Xuanqing and
Deng, Yuqian and
Yang, Yunzhao and
Johnson, Henrik and
Borthwick, Andrew and
Gupta, Shobhit and
Gundlapalli, Aditi Sinha and
Golac, Davor",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.40",
pages = "538--546",
abstract = "Data deduplication is a critical task in data management and mining, focused on consolidating duplicate records that refer to the same entity. Personally Identifiable Information (PII) is a critical class of data for deduplication across various industries. Consumer data, stored and generated through various engagement channels, is crucial for marketers, agencies, and publishers. However, a major challenge to PII data deduplication is the lack of open-source benchmark datasets due to stringent privacy concerns, which hinders the research, development, and evaluation of robust solutions. This paper addresses this critical lack of PII deduplication benchmarks by introducing the first open-source, high-quality dataset for this task. We provide two datasets: one with 1,000,000 unlabeled synthetic PII profiles and a subset of 10,000 pairs curated and labeled by trained annotators as matches or non-matches. Our datasets contain synthetic profiles built from publicly available sources that do not represent any real individuals, thus ensuring privacy and ethical compliance. We provide several challenging data variations to evaluate the effectiveness of various deduplication techniques, including traditional supervised methods, deep-learning approaches, and large language models (LLMs). Our work aims to set a new standard for PII deduplication, paving the way for more accurate and secure solutions. We share our data publicly at this link - https://zenodo.org/records/13932202.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2024-bpid">
<titleInfo>
<title>BPID: A Benchmark for Personal Identity Deduplication</title>
</titleInfo>
<name type="personal">
<namePart type="given">Runhui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yefan</namePart>
<namePart type="family">Tao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adit</namePart>
<namePart type="family">Krishnan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luyang</namePart>
<namePart type="family">Kong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xuanqing</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuqian</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunzhao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henrik</namePart>
<namePart type="family">Johnson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="family">Borthwick</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shobhit</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditi</namePart>
<namePart type="given">Sinha</namePart>
<namePart type="family">Gundlapalli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Davor</namePart>
<namePart type="family">Golac</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Data deduplication is a critical task in data management and mining, focused on consolidating duplicate records that refer to the same entity. Personally Identifiable Information (PII) is a critical class of data for deduplication across various industries. Consumer data, stored and generated through various engagement channels, is crucial for marketers, agencies, and publishers. However, a major challenge to PII data deduplication is the lack of open-source benchmark datasets due to stringent privacy concerns, which hinders the research, development, and evaluation of robust solutions. This paper addresses this critical lack of PII deduplication benchmarks by introducing the first open-source, high-quality dataset for this task. We provide two datasets: one with 1,000,000 unlabeled synthetic PII profiles and a subset of 10,000 pairs curated and labeled by trained annotators as matches or non-matches. Our datasets contain synthetic profiles built from publicly available sources that do not represent any real individuals, thus ensuring privacy and ethical compliance. We provide several challenging data variations to evaluate the effectiveness of various deduplication techniques, including traditional supervised methods, deep-learning approaches, and large language models (LLMs). Our work aims to set a new standard for PII deduplication, paving the way for more accurate and secure solutions. We share our data publicly at this link - https://zenodo.org/records/13932202.</abstract>
<identifier type="citekey">wang-etal-2024-bpid</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.40</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>538</start>
<end>546</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BPID: A Benchmark for Personal Identity Deduplication
%A Wang, Runhui
%A Tao, Yefan
%A Krishnan, Adit
%A Kong, Luyang
%A Liu, Xuanqing
%A Deng, Yuqian
%A Yang, Yunzhao
%A Johnson, Henrik
%A Borthwick, Andrew
%A Gupta, Shobhit
%A Gundlapalli, Aditi Sinha
%A Golac, Davor
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F wang-etal-2024-bpid
%X Data deduplication is a critical task in data management and mining, focused on consolidating duplicate records that refer to the same entity. Personally Identifiable Information (PII) is a critical class of data for deduplication across various industries. Consumer data, stored and generated through various engagement channels, is crucial for marketers, agencies, and publishers. However, a major challenge to PII data deduplication is the lack of open-source benchmark datasets due to stringent privacy concerns, which hinders the research, development, and evaluation of robust solutions. This paper addresses this critical lack of PII deduplication benchmarks by introducing the first open-source, high-quality dataset for this task. We provide two datasets: one with 1,000,000 unlabeled synthetic PII profiles and a subset of 10,000 pairs curated and labeled by trained annotators as matches or non-matches. Our datasets contain synthetic profiles built from publicly available sources that do not represent any real individuals, thus ensuring privacy and ethical compliance. We provide several challenging data variations to evaluate the effectiveness of various deduplication techniques, including traditional supervised methods, deep-learning approaches, and large language models (LLMs). Our work aims to set a new standard for PII deduplication, paving the way for more accurate and secure solutions. We share our data publicly at this link - https://zenodo.org/records/13932202.
%U https://aclanthology.org/2024.emnlp-industry.40
%P 538-546
Markdown (Informal)
[BPID: A Benchmark for Personal Identity Deduplication](https://aclanthology.org/2024.emnlp-industry.40) (Wang et al., EMNLP 2024)
ACL
- Runhui Wang, Yefan Tao, Adit Krishnan, Luyang Kong, Xuanqing Liu, Yuqian Deng, Yunzhao Yang, Henrik Johnson, Andrew Borthwick, Shobhit Gupta, Aditi Sinha Gundlapalli, and Davor Golac. 2024. BPID: A Benchmark for Personal Identity Deduplication. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 538–546, Miami, Florida, US. Association for Computational Linguistics.