@inproceedings{messner-lippincott-2024-examining,
title = "Examining Language Modeling Assumptions Using an Annotated Literary Dialect Corpus",
author = "Messner, Craig and
Lippincott, Thomas",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Miyagawa, So and
Alnajjar, Khalid and
Bizzoni, Yuri},
booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
month = nov,
year = "2024",
address = "Miami, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4dh-1.32",
pages = "325--330",
abstract = "We present a dataset of 19th century American literary orthovariant tokens with a novel layer of human-annotated dialect group tags designed to serve as the basis for computational experiments exploring literarily meaningful orthographic variation. We perform an initial broad set of experiments over this dataset using both token (BERT) and character (CANINE)-level contextual language models. We find indications that the {``}dialect effect{''} produced by intentional orthographic variation employs multiple linguistic channels, and that these channels are able to be surfaced to varied degrees given particular language modelling assumptions. Specifically, we find evidence showing that choice of tokenization scheme meaningfully impact the type of orthographic information a model is able to surface.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="messner-lippincott-2024-examining">
<titleInfo>
<title>Examining Language Modeling Assumptions Using an Annotated Literary Dialect Corpus</title>
</titleInfo>
<name type="personal">
<namePart type="given">Craig</namePart>
<namePart type="family">Messner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thomas</namePart>
<namePart type="family">Lippincott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a dataset of 19th century American literary orthovariant tokens with a novel layer of human-annotated dialect group tags designed to serve as the basis for computational experiments exploring literarily meaningful orthographic variation. We perform an initial broad set of experiments over this dataset using both token (BERT) and character (CANINE)-level contextual language models. We find indications that the “dialect effect” produced by intentional orthographic variation employs multiple linguistic channels, and that these channels are able to be surfaced to varied degrees given particular language modelling assumptions. Specifically, we find evidence showing that choice of tokenization scheme meaningfully impact the type of orthographic information a model is able to surface.</abstract>
<identifier type="citekey">messner-lippincott-2024-examining</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4dh-1.32</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>325</start>
<end>330</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Examining Language Modeling Assumptions Using an Annotated Literary Dialect Corpus
%A Messner, Craig
%A Lippincott, Thomas
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Miyagawa, So
%Y Alnajjar, Khalid
%Y Bizzoni, Yuri
%S Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, USA
%F messner-lippincott-2024-examining
%X We present a dataset of 19th century American literary orthovariant tokens with a novel layer of human-annotated dialect group tags designed to serve as the basis for computational experiments exploring literarily meaningful orthographic variation. We perform an initial broad set of experiments over this dataset using both token (BERT) and character (CANINE)-level contextual language models. We find indications that the “dialect effect” produced by intentional orthographic variation employs multiple linguistic channels, and that these channels are able to be surfaced to varied degrees given particular language modelling assumptions. Specifically, we find evidence showing that choice of tokenization scheme meaningfully impact the type of orthographic information a model is able to surface.
%U https://aclanthology.org/2024.nlp4dh-1.32
%P 325-330
Markdown (Informal)
[Examining Language Modeling Assumptions Using an Annotated Literary Dialect Corpus](https://aclanthology.org/2024.nlp4dh-1.32) (Messner & Lippincott, NLP4DH 2024)
ACL