@inproceedings{choi-etal-2026-k,
title = "K-{L}egal{D}e{ID}: A Benchmark Dataset and {KLUEBERT}-{CRF} for De-identification in {K}orean Court Judgments",
author = "Choi, Wooseok and
Kim, Hyungbin and
Chung, Yon Dohn",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.103/",
pages = "2308--2325",
ISBN = "979-8-89176-380-7",
abstract = "The Korean legal system mandates public access to court judgments to ensure judicial transparency. However, this requirement conflicts with privacy protection obligations due to the prevalence of Personally Identifiable Information (PII) in legal documents. To address this challenge, we introduce **K-LegalDeID**, a large-scale benchmark dataset and an efficient KLUEBERT-CRF model for de-identification for Korean court judgments. Our primary contribution is a new large-scale benchmark dataset spanning 39 legal domains, with its quality is validated by a high inter-annotator agreement (IAA) with Fleiss' Kappa of 0.7352. Our results demonstrate that a lightweight KLUEBERT-CRF model, when trained on our dataset, achieves state-of-the-art performance with an entity-level micro F1 score of 0.9923. Our end-to-end framework offers a practical and computationally efficient solution for real-world legal systems."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="choi-etal-2026-k">
<titleInfo>
<title>K-LegalDeID: A Benchmark Dataset and KLUEBERT-CRF for De-identification in Korean Court Judgments</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wooseok</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyungbin</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yon</namePart>
<namePart type="given">Dohn</namePart>
<namePart type="family">Chung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>The Korean legal system mandates public access to court judgments to ensure judicial transparency. However, this requirement conflicts with privacy protection obligations due to the prevalence of Personally Identifiable Information (PII) in legal documents. To address this challenge, we introduce **K-LegalDeID**, a large-scale benchmark dataset and an efficient KLUEBERT-CRF model for de-identification for Korean court judgments. Our primary contribution is a new large-scale benchmark dataset spanning 39 legal domains, with its quality is validated by a high inter-annotator agreement (IAA) with Fleiss’ Kappa of 0.7352. Our results demonstrate that a lightweight KLUEBERT-CRF model, when trained on our dataset, achieves state-of-the-art performance with an entity-level micro F1 score of 0.9923. Our end-to-end framework offers a practical and computationally efficient solution for real-world legal systems.</abstract>
<identifier type="citekey">choi-etal-2026-k</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.103/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>2308</start>
<end>2325</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T K-LegalDeID: A Benchmark Dataset and KLUEBERT-CRF for De-identification in Korean Court Judgments
%A Choi, Wooseok
%A Kim, Hyungbin
%A Chung, Yon Dohn
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F choi-etal-2026-k
%X The Korean legal system mandates public access to court judgments to ensure judicial transparency. However, this requirement conflicts with privacy protection obligations due to the prevalence of Personally Identifiable Information (PII) in legal documents. To address this challenge, we introduce **K-LegalDeID**, a large-scale benchmark dataset and an efficient KLUEBERT-CRF model for de-identification for Korean court judgments. Our primary contribution is a new large-scale benchmark dataset spanning 39 legal domains, with its quality is validated by a high inter-annotator agreement (IAA) with Fleiss’ Kappa of 0.7352. Our results demonstrate that a lightweight KLUEBERT-CRF model, when trained on our dataset, achieves state-of-the-art performance with an entity-level micro F1 score of 0.9923. Our end-to-end framework offers a practical and computationally efficient solution for real-world legal systems.
%U https://aclanthology.org/2026.eacl-long.103/
%P 2308-2325
Markdown (Informal)
[K-LegalDeID: A Benchmark Dataset and KLUEBERT-CRF for De-identification in Korean Court Judgments](https://aclanthology.org/2026.eacl-long.103/) (Choi et al., EACL 2026)
ACL