@inproceedings{nyffenegger-etal-2024-anonymity,
title = "Anonymity at Risk? Assessing Re-Identification Capabilities of Large Language Models in Court Decisions",
author = {Nyffenegger, Alex and
St{\"u}rmer, Matthias and
Niklaus, Joel},
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.157",
doi = "10.18653/v1/2024.findings-naacl.157",
pages = "2433--2462",
abstract = "Anonymity in court rulings is a critical aspect of privacy protection in the European Union and Switzerland but with the advent of LLMs, concerns about large-scale re-identification of anonymized persons are growing. In accordance with the Federal Supreme Court of Switzerland (FSCS), we study re-identification risks using actual legal data. Following the initial experiment, we constructed an anonymized Wikipedia dataset as a more rigorous testing ground to further investigate the findings. In addition to the datasets, we also introduce new metrics to measure performance. We systematically analyze the factors that influence successful re-identifications, identifying model size, input length, and instruction tuning among the most critical determinants. Despite high re-identification rates on Wikipedia, even the best LLMs struggled with court decisions. We demonstrate that for now, the risk of re-identifications using LLMs is minimal in the vast majority of cases. We hope that our system can help enhance the confidence in the security of anonymized decisions, thus leading the courts to publish more decisions.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nyffenegger-etal-2024-anonymity">
<titleInfo>
<title>Anonymity at Risk? Assessing Re-Identification Capabilities of Large Language Models in Court Decisions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Nyffenegger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matthias</namePart>
<namePart type="family">Stürmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Niklaus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Anonymity in court rulings is a critical aspect of privacy protection in the European Union and Switzerland but with the advent of LLMs, concerns about large-scale re-identification of anonymized persons are growing. In accordance with the Federal Supreme Court of Switzerland (FSCS), we study re-identification risks using actual legal data. Following the initial experiment, we constructed an anonymized Wikipedia dataset as a more rigorous testing ground to further investigate the findings. In addition to the datasets, we also introduce new metrics to measure performance. We systematically analyze the factors that influence successful re-identifications, identifying model size, input length, and instruction tuning among the most critical determinants. Despite high re-identification rates on Wikipedia, even the best LLMs struggled with court decisions. We demonstrate that for now, the risk of re-identifications using LLMs is minimal in the vast majority of cases. We hope that our system can help enhance the confidence in the security of anonymized decisions, thus leading the courts to publish more decisions.</abstract>
<identifier type="citekey">nyffenegger-etal-2024-anonymity</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.157</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.157</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>2433</start>
<end>2462</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Anonymity at Risk? Assessing Re-Identification Capabilities of Large Language Models in Court Decisions
%A Nyffenegger, Alex
%A Stürmer, Matthias
%A Niklaus, Joel
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F nyffenegger-etal-2024-anonymity
%X Anonymity in court rulings is a critical aspect of privacy protection in the European Union and Switzerland but with the advent of LLMs, concerns about large-scale re-identification of anonymized persons are growing. In accordance with the Federal Supreme Court of Switzerland (FSCS), we study re-identification risks using actual legal data. Following the initial experiment, we constructed an anonymized Wikipedia dataset as a more rigorous testing ground to further investigate the findings. In addition to the datasets, we also introduce new metrics to measure performance. We systematically analyze the factors that influence successful re-identifications, identifying model size, input length, and instruction tuning among the most critical determinants. Despite high re-identification rates on Wikipedia, even the best LLMs struggled with court decisions. We demonstrate that for now, the risk of re-identifications using LLMs is minimal in the vast majority of cases. We hope that our system can help enhance the confidence in the security of anonymized decisions, thus leading the courts to publish more decisions.
%R 10.18653/v1/2024.findings-naacl.157
%U https://aclanthology.org/2024.findings-naacl.157
%U https://doi.org/10.18653/v1/2024.findings-naacl.157
%P 2433-2462
Markdown (Informal)
[Anonymity at Risk? Assessing Re-Identification Capabilities of Large Language Models in Court Decisions](https://aclanthology.org/2024.findings-naacl.157) (Nyffenegger et al., Findings 2024)
ACL