@inproceedings{stahel-etal-2025-bit,
title = "A Bit of This, a Bit of That: Building a Genre and Topic Annotated Dataset of Historical Newspaper Articles with Soft Labels and Confidence Scores",
author = "Stahel, Karin and
How, Irenie and
Millar, Lauren and
Paterson, Luis and
Steel, Daniel and
Middendorf, Kaspar",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Bizzoni, Yuri and
Miyagawa, So and
Alnajjar, Khalid},
booktitle = "Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities",
month = may,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.nlp4dh-1.33/",
doi = "10.18653/v1/2025.nlp4dh-1.33",
pages = "377--392",
ISBN = "979-8-89176-234-3",
abstract = "Digitised historical newspaper collections are becoming increasingly accessible, yet their scale and diverse content still present challenges for researchers interested in specific article types or topics. In a step towards developing models to address these challenges, we have created a dataset of articles from New Zealand{'}s Papers Past open data annotated with multiple genre and topic labels and annotator confidence scores. Our annotation framework aligns with the perspectivist approach to machine learning, acknowledging the subjective nature of the task and embracing the hybridity and uncertainty of genres. In this paper, we describe our sampling and annotation methods and the resulting dataset of 7,036 articles from 106 New Zealand newspapers spanning the period 1839-1903. This dataset will be used to develop interpretable classification models that enable fine-grained exploration and discovery of articles in Papers Past newspapers based on common aspects of form, function, and topic. The complete dataset, including un-aggregated annotations and supporting documentation, will eventually be openly released to facilitate further research."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="stahel-etal-2025-bit">
<titleInfo>
<title>A Bit of This, a Bit of That: Building a Genre and Topic Annotated Dataset of Historical Newspaper Articles with Soft Labels and Confidence Scores</title>
</titleInfo>
<name type="personal">
<namePart type="given">Karin</namePart>
<namePart type="family">Stahel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irenie</namePart>
<namePart type="family">How</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lauren</namePart>
<namePart type="family">Millar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Paterson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Steel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaspar</namePart>
<namePart type="family">Middendorf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-234-3</identifier>
</relatedItem>
<abstract>Digitised historical newspaper collections are becoming increasingly accessible, yet their scale and diverse content still present challenges for researchers interested in specific article types or topics. In a step towards developing models to address these challenges, we have created a dataset of articles from New Zealand’s Papers Past open data annotated with multiple genre and topic labels and annotator confidence scores. Our annotation framework aligns with the perspectivist approach to machine learning, acknowledging the subjective nature of the task and embracing the hybridity and uncertainty of genres. In this paper, we describe our sampling and annotation methods and the resulting dataset of 7,036 articles from 106 New Zealand newspapers spanning the period 1839-1903. This dataset will be used to develop interpretable classification models that enable fine-grained exploration and discovery of articles in Papers Past newspapers based on common aspects of form, function, and topic. The complete dataset, including un-aggregated annotations and supporting documentation, will eventually be openly released to facilitate further research.</abstract>
<identifier type="citekey">stahel-etal-2025-bit</identifier>
<identifier type="doi">10.18653/v1/2025.nlp4dh-1.33</identifier>
<location>
<url>https://aclanthology.org/2025.nlp4dh-1.33/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>377</start>
<end>392</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Bit of This, a Bit of That: Building a Genre and Topic Annotated Dataset of Historical Newspaper Articles with Soft Labels and Confidence Scores
%A Stahel, Karin
%A How, Irenie
%A Millar, Lauren
%A Paterson, Luis
%A Steel, Daniel
%A Middendorf, Kaspar
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Bizzoni, Yuri
%Y Miyagawa, So
%Y Alnajjar, Khalid
%S Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, USA
%@ 979-8-89176-234-3
%F stahel-etal-2025-bit
%X Digitised historical newspaper collections are becoming increasingly accessible, yet their scale and diverse content still present challenges for researchers interested in specific article types or topics. In a step towards developing models to address these challenges, we have created a dataset of articles from New Zealand’s Papers Past open data annotated with multiple genre and topic labels and annotator confidence scores. Our annotation framework aligns with the perspectivist approach to machine learning, acknowledging the subjective nature of the task and embracing the hybridity and uncertainty of genres. In this paper, we describe our sampling and annotation methods and the resulting dataset of 7,036 articles from 106 New Zealand newspapers spanning the period 1839-1903. This dataset will be used to develop interpretable classification models that enable fine-grained exploration and discovery of articles in Papers Past newspapers based on common aspects of form, function, and topic. The complete dataset, including un-aggregated annotations and supporting documentation, will eventually be openly released to facilitate further research.
%R 10.18653/v1/2025.nlp4dh-1.33
%U https://aclanthology.org/2025.nlp4dh-1.33/
%U https://doi.org/10.18653/v1/2025.nlp4dh-1.33
%P 377-392
Markdown (Informal)
[A Bit of This, a Bit of That: Building a Genre and Topic Annotated Dataset of Historical Newspaper Articles with Soft Labels and Confidence Scores](https://aclanthology.org/2025.nlp4dh-1.33/) (Stahel et al., NLP4DH 2025)
ACL