@inproceedings{szawerna-etal-2025-annotating,
title = "Annotating Personal Information in {S}wedish Texts with {SPARV}",
author = "Szawerna, Maria Irena and
Alfter, David and
Volodina, Elena",
editor = "Arachchige, Isuri Nanomi and
Frontini, Francesca and
Mitkov, Ruslan and
Rayson, Paul",
booktitle = "Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.lm4dh-1.15/",
pages = "155--163",
abstract = "Digital Humanities (DH) research, among many others, relies on data, a subset of which comes in the form of language data that contains personal information (PI). Working with and sharing such data has ethical and legal implications. The process of removing (anonymization) or replacing (pseudonymization) of personal information in texts may be used to address these issues, and often begins with a PI detection and labeling stage. We present a new tool for personal information detection and labeling for Swedish, SBX-PI-DETECTION (henceforth SBX-PI), alongside a visualization interface, (IM)PERSONAL DATA, which allows for the comparison of outputs from different tools. A valuable feature of SBX-PI is that it enables the users to run the annotation locally. It is also integrated into the text annotation pipeline SPARV, allowing for other types of annotation to be performed simultaneously and contributing to the privacy by design requirement set by the GDPR. A novel feature of (IM)PERSONAL DATA is that it allows researchers to assess the extent of detected PI in a text and how much of it will be manipulated once anonymization or pseudonymization are applied. The tools are primarily aimed at researchers within Digital Humanities and Natural Language Processing and are linked to CLARIN{'}s Virtual Language Observatory."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="szawerna-etal-2025-annotating">
<titleInfo>
<title>Annotating Personal Information in Swedish Texts with SPARV</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Irena</namePart>
<namePart type="family">Szawerna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Alfter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Volodina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Isuri</namePart>
<namePart type="given">Nanomi</namePart>
<namePart type="family">Arachchige</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesca</namePart>
<namePart type="family">Frontini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Digital Humanities (DH) research, among many others, relies on data, a subset of which comes in the form of language data that contains personal information (PI). Working with and sharing such data has ethical and legal implications. The process of removing (anonymization) or replacing (pseudonymization) of personal information in texts may be used to address these issues, and often begins with a PI detection and labeling stage. We present a new tool for personal information detection and labeling for Swedish, SBX-PI-DETECTION (henceforth SBX-PI), alongside a visualization interface, (IM)PERSONAL DATA, which allows for the comparison of outputs from different tools. A valuable feature of SBX-PI is that it enables the users to run the annotation locally. It is also integrated into the text annotation pipeline SPARV, allowing for other types of annotation to be performed simultaneously and contributing to the privacy by design requirement set by the GDPR. A novel feature of (IM)PERSONAL DATA is that it allows researchers to assess the extent of detected PI in a text and how much of it will be manipulated once anonymization or pseudonymization are applied. The tools are primarily aimed at researchers within Digital Humanities and Natural Language Processing and are linked to CLARIN’s Virtual Language Observatory.</abstract>
<identifier type="citekey">szawerna-etal-2025-annotating</identifier>
<location>
<url>https://aclanthology.org/2025.lm4dh-1.15/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>155</start>
<end>163</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Annotating Personal Information in Swedish Texts with SPARV
%A Szawerna, Maria Irena
%A Alfter, David
%A Volodina, Elena
%Y Arachchige, Isuri Nanomi
%Y Frontini, Francesca
%Y Mitkov, Ruslan
%Y Rayson, Paul
%S Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F szawerna-etal-2025-annotating
%X Digital Humanities (DH) research, among many others, relies on data, a subset of which comes in the form of language data that contains personal information (PI). Working with and sharing such data has ethical and legal implications. The process of removing (anonymization) or replacing (pseudonymization) of personal information in texts may be used to address these issues, and often begins with a PI detection and labeling stage. We present a new tool for personal information detection and labeling for Swedish, SBX-PI-DETECTION (henceforth SBX-PI), alongside a visualization interface, (IM)PERSONAL DATA, which allows for the comparison of outputs from different tools. A valuable feature of SBX-PI is that it enables the users to run the annotation locally. It is also integrated into the text annotation pipeline SPARV, allowing for other types of annotation to be performed simultaneously and contributing to the privacy by design requirement set by the GDPR. A novel feature of (IM)PERSONAL DATA is that it allows researchers to assess the extent of detected PI in a text and how much of it will be manipulated once anonymization or pseudonymization are applied. The tools are primarily aimed at researchers within Digital Humanities and Natural Language Processing and are linked to CLARIN’s Virtual Language Observatory.
%U https://aclanthology.org/2025.lm4dh-1.15/
%P 155-163
Markdown (Informal)
[Annotating Personal Information in Swedish Texts with SPARV](https://aclanthology.org/2025.lm4dh-1.15/) (Szawerna et al., LM4DH 2025)
ACL