% BibTeX record for the paper at https://aclanthology.org/2026.nlp4dh-1.14/ (text outside an entry is ignored by BibTeX).
@inproceedings{henriksson-etal-2026-register,
  title     = {Register Mixing Is the Norm on the Web},
  author    = {Henriksson, Erik and
               Razzaghi, Alireza and
               Lundberg, Tuomas and
               Kanner, Antti and
               Laippala, Veronika},
  editor    = {Hamilton, Sil and
               {\"O}hman, Emily and
               Hicke, Rebecca M. M. and
               Bizzoni, Yuri and
               Bax, Axel and
               Matthews, Jacob A. and
               H{\"a}m{\"a}l{\"a}inen, Mika},
  booktitle = {Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.nlp4dh-1.14/},
  pages     = {138--149},
  isbn      = {979-8-89176-427-9},
  abstract  = {Nearly all studies on web registers{---}online text varieties associated with characteristic social contexts and linguistic features{---}use full documents as the unit of analysis. However, web documents often contain sections in different registers. A cooking blog, for instance, may combine personal storytelling, recipe instructions, user comments, and promotional text within a single URL. This internal variation raises doubts about the validity of document level register labeling. In this paper, we propose an LLM-based approach that identifies register homogeneous segments within documents and apply it to a 10,000-document English sample from HPLT 3.0. We show that segmentation addresses persistent problems in register analysis, including low inter-annotator agreement and category fuzziness. Strikingly, it also reveals that most web documents contain more than one register, making register mixing the norm rather than the exception on the web.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier given further down. -->
  <mods ID="henriksson-etal-2026-register">
    <titleInfo>
      <title>Register Mixing Is the Norm on the Web</title>
    </titleInfo>

    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Erik</namePart>
      <namePart type="family">Henriksson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alireza</namePart>
      <namePart type="family">Razzaghi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tuomas</namePart>
      <namePart type="family">Lundberg</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Antti</namePart>
      <namePart type="family">Kanner</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Veronika</namePart>
      <namePart type="family">Laippala</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sil</namePart>
        <namePart type="family">Hamilton</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Emily</namePart>
        <namePart type="family">Öhman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rebecca</namePart>
        <namePart type="given">M</namePart>
        <namePart type="given">M</namePart>
        <namePart type="family">Hicke</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yuri</namePart>
        <namePart type="family">Bizzoni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Axel</namePart>
        <namePart type="family">Bax</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jacob</namePart>
        <namePart type="given">A</namePart>
        <namePart type="family">Matthews</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mika</namePart>
        <namePart type="family">Hämäläinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-427-9</identifier>
    </relatedItem>

    <abstract>Nearly all studies on web registers—online text varieties associated with characteristic social contexts and linguistic features—use full documents as the unit of analysis. However, web documents often contain sections in different registers. A cooking blog, for instance, may combine personal storytelling, recipe instructions, user comments, and promotional text within a single URL. This internal variation raises doubts about the validity of document level register labeling. In this paper, we propose an LLM-based approach that identifies register homogeneous segments within documents and apply it to a 10,000-document English sample from HPLT 3.0. We show that segmentation addresses persistent problems in register analysis, including low inter-annotator agreement and category fuzziness. Strikingly, it also reveals that most web documents contain more than one register, making register mixing the norm rather than the exception on the web.</abstract>
    <identifier type="citekey">henriksson-etal-2026-register</identifier>
    <location>
      <url>https://aclanthology.org/2026.nlp4dh-1.14/</url>
    </location>

    <!-- Position of the paper inside the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>138</start>
        <end>149</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Register Mixing Is the Norm on the Web
%A Henriksson, Erik
%A Razzaghi, Alireza
%A Lundberg, Tuomas
%A Kanner, Antti
%A Laippala, Veronika
%Y Hamilton, Sil
%Y Öhman, Emily
%Y Hicke, Rebecca M. M.
%Y Bizzoni, Yuri
%Y Bax, Axel
%Y Matthews, Jacob A.
%Y Hämäläinen, Mika
%S Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, USA
%@ 979-8-89176-427-9
%F henriksson-etal-2026-register
%X Nearly all studies on web registers—online text varieties associated with characteristic social contexts and linguistic features—use full documents as the unit of analysis. However, web documents often contain sections in different registers. A cooking blog, for instance, may combine personal storytelling, recipe instructions, user comments, and promotional text within a single URL. This internal variation raises doubts about the validity of document level register labeling. In this paper, we propose an LLM-based approach that identifies register homogeneous segments within documents and apply it to a 10,000-document English sample from HPLT 3.0. We show that segmentation addresses persistent problems in register analysis, including low inter-annotator agreement and category fuzziness. Strikingly, it also reveals that most web documents contain more than one register, making register mixing the norm rather than the exception on the web.
%U https://aclanthology.org/2026.nlp4dh-1.14/
%P 138-149
Markdown (Informal)
[Register Mixing Is the Norm on the Web](https://aclanthology.org/2026.nlp4dh-1.14/) (Henriksson et al., NLP4DH 2026)
ACL
- Erik Henriksson, Alireza Razzaghi, Tuomas Lundberg, Antti Kanner, and Veronika Laippala. 2026. Register Mixing Is the Norm on the Web. In Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities, pages 138–149, San Diego, USA. Association for Computational Linguistics.