@inproceedings{myntti-etal-2024-intersecting,
title = "Intersecting Register and Genre: Understanding the Contents of Web-Crawled Corpora",
author = "Myntti, Amanda and
Repo, Liina and
Freyermuth, Elian and
Kanner, Antti and
Laippala, Veronika and
Henriksson, Erik",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Miyagawa, So and
Alnajjar, Khalid and
Bizzoni, Yuri},
booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
month = nov,
year = "2024",
address = "Miami, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4dh-1.38",
pages = "386--397",
abstract = "Web-scale corpora present valuable research opportunities but often lack detailed metadata, making them challenging to use in linguistics and social sciences. This study tackles this problem by exploring automatic methods to classify web corpora into specific categories, focusing on text registers such as Interactive Discussion and literary genres such as Politics and Social Sciences. We train two machine learning models to classify documents from the large web-crawled OSCAR dataset: a register classifier using the multilingual, manually annotated CORE corpus, and a genre classifier using a dataset based on Kindle US{\&}UK. Fine-tuned from XLM-R Large, the register and genre classifiers achieved F1-scores of 0.74 and 0.70, respectively. Our analysis includes evaluating the distribution of the predicted text classes and examining the intersection of genre-register pairs using topic modelling. The results show expected combinations between certain registers and genres, such as the Lyrical register often aligning with the Literature {\&} Fiction genre. However, most registers, such as Interactive Discussion, are divided across multiple genres, like Engineering {\&} Transportation and Politics {\&} Social Sciences, depending on the discussion topic. This enriched metadata provides valuable insights and supports new ways of studying digital cultural heritage.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="myntti-etal-2024-intersecting">
<titleInfo>
<title>Intersecting Register and Genre: Understanding the Contents of Web-Crawled Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amanda</namePart>
<namePart type="family">Myntti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liina</namePart>
<namePart type="family">Repo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elian</namePart>
<namePart type="family">Freyermuth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antti</namePart>
<namePart type="family">Kanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronika</namePart>
<namePart type="family">Laippala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Henriksson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Web-scale corpora present valuable research opportunities but often lack detailed metadata, making them challenging to use in linguistics and social sciences. This study tackles this problem by exploring automatic methods to classify web corpora into specific categories, focusing on text registers such as Interactive Discussion and literary genres such as Politics and Social Sciences. We train two machine learning models to classify documents from the large web-crawled OSCAR dataset: a register classifier using the multilingual, manually annotated CORE corpus, and a genre classifier using a dataset based on Kindle US&UK. Fine-tuned from XLM-R Large, the register and genre classifiers achieved F1-scores of 0.74 and 0.70, respectively. Our analysis includes evaluating the distribution of the predicted text classes and examining the intersection of genre-register pairs using topic modelling. The results show expected combinations between certain registers and genres, such as the Lyrical register often aligning with the Literature & Fiction genre. However, most registers, such as Interactive Discussion, are divided across multiple genres, like Engineering & Transportation and Politics & Social Sciences, depending on the discussion topic. This enriched metadata provides valuable insights and supports new ways of studying digital cultural heritage.</abstract>
<identifier type="citekey">myntti-etal-2024-intersecting</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4dh-1.38</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>386</start>
<end>397</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Intersecting Register and Genre: Understanding the Contents of Web-Crawled Corpora
%A Myntti, Amanda
%A Repo, Liina
%A Freyermuth, Elian
%A Kanner, Antti
%A Laippala, Veronika
%A Henriksson, Erik
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Miyagawa, So
%Y Alnajjar, Khalid
%Y Bizzoni, Yuri
%S Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, USA
%F myntti-etal-2024-intersecting
%X Web-scale corpora present valuable research opportunities but often lack detailed metadata, making them challenging to use in linguistics and social sciences. This study tackles this problem by exploring automatic methods to classify web corpora into specific categories, focusing on text registers such as Interactive Discussion and literary genres such as Politics and Social Sciences. We train two machine learning models to classify documents from the large web-crawled OSCAR dataset: a register classifier using the multilingual, manually annotated CORE corpus, and a genre classifier using a dataset based on Kindle US&UK. Fine-tuned from XLM-R Large, the register and genre classifiers achieved F1-scores of 0.74 and 0.70, respectively. Our analysis includes evaluating the distribution of the predicted text classes and examining the intersection of genre-register pairs using topic modelling. The results show expected combinations between certain registers and genres, such as the Lyrical register often aligning with the Literature & Fiction genre. However, most registers, such as Interactive Discussion, are divided across multiple genres, like Engineering & Transportation and Politics & Social Sciences, depending on the discussion topic. This enriched metadata provides valuable insights and supports new ways of studying digital cultural heritage.
%U https://aclanthology.org/2024.nlp4dh-1.38
%P 386-397
Markdown (Informal)
[Intersecting Register and Genre: Understanding the Contents of Web-Crawled Corpora](https://aclanthology.org/2024.nlp4dh-1.38) (Myntti et al., NLP4DH 2024)
ACL