% Bibliographic record for Karlgren (2022), published in NEJLT Volume 8.
% The doi field holds the bare identifier (no https://doi.org/ resolver prefix);
% bibliography styles prepend the resolver themselves, so a full URL here
% would be rendered with a doubled prefix.
@inproceedings{karlgren-2022-lexical,
  title     = {Lexical variation in {English} language podcasts, editorial media, and social media},
  author    = {Karlgren, Jussi},
  editor    = {Derczynski, Leon},
  booktitle = {Northern European Journal of Language Technology, Volume 8},
  year      = {2022},
  address   = {Copenhagen, Denmark},
  publisher = {Northern European Association of Language Technology},
  url       = {https://aclanthology.org/2022.nejlt-1.8},
  doi       = {10.3384/nejlt.2000-1533.2022.3566},
  abstract  = {The study presented in this paper demonstrates how transcribed podcast material differs with respect to lexical content from other collections of English language data: editorial text, social media, both long form and microblogs, dialogue from movie scripts, and transcribed phone conversations. Most of the recorded differences are as might be expected, reflecting known or assumed difference between spoken and written language, between dialogue and soliloquy, and between scripted formal and unscripted informal language use. Most notably, podcast material, compared to the hitherto typical training sets from editorial media, is characterised by being in the present tense, and with a much higher incidence of pronouns, interjections, and negations. These characteristics are, unsurprisingly, largely shared with social media texts. Where podcast material differs from social media material is in its attitudinal content, with many more amplifiers and much less negative attitude than in blog texts. This variation, besides being of philological interest, has ramifications for computational work. Information access for material which is not primarily topical should be designed to be sensitive to such variation that defines the data set itself and discriminates items within it. In general, training sets for language models are a non-trivial parameter which are likely to show effects both expected and unexpected when applied to data from other sources and the characteristics and provenance of data used to train a model should be listed on the label as a minimal form of downstream consumer protection.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the article with citekey karlgren-2022-lexical. -->
  <mods ID="karlgren-2022-lexical">
    <titleInfo>
      <title>Lexical variation in English language podcasts, editorial media, and social media</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jussi</namePart>
      <namePart type="family">Karlgren</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the volume this article appears in, with its editor and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Northern European Journal of Language Technology, Volume 8</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Leon</namePart>
        <namePart type="family">Derczynski</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Northern European Association of Language Technology</publisher>
        <place>
          <placeTerm type="text">Copenhagen, Denmark</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The study presented in this paper demonstrates how transcribed podcast material differs with respect to lexical content from other collections of English language data: editorial text, social media, both long form and microblogs, dialogue from movie scripts, and transcribed phone conversations. Most of the recorded differences are as might be expected, reflecting known or assumed difference between spoken and written language, between dialogue and soliloquy, and between scripted formal and unscripted informal language use. Most notably, podcast material, compared to the hitherto typical training sets from editorial media, is characterised by being in the present tense, and with a much higher incidence of pronouns, interjections, and negations. These characteristics are, unsurprisingly, largely shared with social media texts. Where podcast material differs from social media material is in its attitudinal content, with many more amplifiers and much less negative attitude than in blog texts. This variation, besides being of philological interest, has ramifications for computational work. Information access for material which is not primarily topical should be designed to be sensitive to such variation that defines the data set itself and discriminates items within it. In general, training sets for language models are a non-trivial parameter which are likely to show effects both expected and unexpected when applied to data from other sources and the characteristics and provenance of data used to train a model should be listed on the label as a minimal form of downstream consumer protection.</abstract>
    <identifier type="citekey">karlgren-2022-lexical</identifier>
    <!-- Bare DOI: type="doi" already names the scheme, so the value carries no resolver URL prefix. -->
    <identifier type="doi">10.3384/nejlt.2000-1533.2022.3566</identifier>
    <location>
      <url>https://aclanthology.org/2022.nejlt-1.8</url>
    </location>
    <part>
      <date>2022</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Lexical variation in English language podcasts, editorial media, and social media
%A Karlgren, Jussi
%Y Derczynski, Leon
%S Northern European Journal of Language Technology, Volume 8
%D 2022
%I Northern European Association of Language Technology
%C Copenhagen, Denmark
%F karlgren-2022-lexical
%X The study presented in this paper demonstrates how transcribed podcast material differs with respect to lexical content from other collections of English language data: editorial text, social media, both long form and microblogs, dialogue from movie scripts, and transcribed phone conversations. Most of the recorded differences are as might be expected, reflecting known or assumed difference between spoken and written language, between dialogue and soliloquy, and between scripted formal and unscripted informal language use. Most notably, podcast material, compared to the hitherto typical training sets from editorial media, is characterised by being in the present tense, and with a much higher incidence of pronouns, interjections, and negations. These characteristics are, unsurprisingly, largely shared with social media texts. Where podcast material differs from social media material is in its attitudinal content, with many more amplifiers and much less negative attitude than in blog texts. This variation, besides being of philological interest, has ramifications for computational work. Information access for material which is not primarily topical should be designed to be sensitive to such variation that defines the data set itself and discriminates items within it. In general, training sets for language models are a non-trivial parameter which are likely to show effects both expected and unexpected when applied to data from other sources and the characteristics and provenance of data used to train a model should be listed on the label as a minimal form of downstream consumer protection.
%R 10.3384/nejlt.2000-1533.2022.3566
%U https://aclanthology.org/2022.nejlt-1.8
%U https://doi.org/10.3384/nejlt.2000-1533.2022.3566
Markdown (Informal)
[Lexical variation in English language podcasts, editorial media, and social media](https://aclanthology.org/2022.nejlt-1.8) (Karlgren, NEJLT 2022)
ACL