% Conference paper in the LREC-COLING 2024 proceedings. Proper nouns in the title are braced as whole words so sentence-casing styles keep their capitals.
@inproceedings{kuparinen-2024-murre24,
  title     = {Murre24: Dialect Identification of {Finnish} {Internet} Forum Messages},
  author    = {Kuparinen, Olli},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.1048},
  pages     = {12003--12015},
  abstract  = {This paper presents Murre24, a collection of dialectal messages posted on the largest Finnish internet forum, Suomi24. The messages posted in Finnish on the forum between 2001 and 2020 are classified to present either the standard language, one of the seven traditional dialects, a colloquial style or the Helsinki slang. We present a manually annotated dataset used to train dialect identification models as well as the automatic annotation of almost 94 million messages in total. We experiment with five different dialect identification methods and evaluate them on dialectally balanced and random test samples. The best performing method for differentiating standard Finnish from non-standard Finnish is a character n-gram based support vector machine (SVM), while fine-tuning a BERT-based model achieves best scores in the final dialect identification task. According to the automatic classification, most of the messages written on the forum are in standard Finnish, and most of the non-standard messages are in a colloquial variety used typically by young speakers in Finland. We moreover show that the proportion of non-standard messages declines over time, but the proportion of the traditional dialects stays relatively steady.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, citekey kuparinen-2024-murre24. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kuparinen-2024-murre24">
    <!-- The paper itself: title, author, issue date. -->
    <titleInfo>
      <title>Murre24: Dialect Identification of Finnish Internet Forum Messages</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Olli</namePart>
      <namePart type="family">Kuparinen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its six editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Min-Yen</namePart>
        <namePart type="family">Kan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Veronique</namePart>
        <namePart type="family">Hoste</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alessandro</namePart>
        <namePart type="family">Lenci</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sakriani</namePart>
        <namePart type="family">Sakti</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nianwen</namePart>
        <namePart type="family">Xue</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- Abstract, citation key, landing URL and page extent of the paper. -->
    <abstract>This paper presents Murre24, a collection of dialectal messages posted on the largest Finnish internet forum, Suomi24. The messages posted in Finnish on the forum between 2001 and 2020 are classified to present either the standard language, one of the seven traditional dialects, a colloquial style or the Helsinki slang. We present a manually annotated dataset used to train dialect identification models as well as the automatic annotation of almost 94 million messages in total. We experiment with five different dialect identification methods and evaluate them on dialectally balanced and random test samples. The best performing method for differentiating standard Finnish from non-standard Finnish is a character n-gram based support vector machine (SVM), while fine-tuning a BERT-based model achieves best scores in the final dialect identification task. According to the automatic classification, most of the messages written on the forum are in standard Finnish, and most of the non-standard messages are in a colloquial variety used typically by young speakers in Finland. We moreover show that the proportion of non-standard messages declines over time, but the proportion of the traditional dialects stays relatively steady.</abstract>
    <identifier type="citekey">kuparinen-2024-murre24</identifier>
    <location>
      <url>https://aclanthology.org/2024.lrec-main.1048</url>
    </location>
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>12003</start>
        <end>12015</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Murre24: Dialect Identification of Finnish Internet Forum Messages
%A Kuparinen, Olli
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F kuparinen-2024-murre24
%X This paper presents Murre24, a collection of dialectal messages posted on the largest Finnish internet forum, Suomi24. The messages posted in Finnish on the forum between 2001 and 2020 are classified to present either the standard language, one of the seven traditional dialects, a colloquial style or the Helsinki slang. We present a manually annotated dataset used to train dialect identification models as well as the automatic annotation of almost 94 million messages in total. We experiment with five different dialect identification methods and evaluate them on dialectally balanced and random test samples. The best performing method for differentiating standard Finnish from non-standard Finnish is a character n-gram based support vector machine (SVM), while fine-tuning a BERT-based model achieves best scores in the final dialect identification task. According to the automatic classification, most of the messages written on the forum are in standard Finnish, and most of the non-standard messages are in a colloquial variety used typically by young speakers in Finland. We moreover show that the proportion of non-standard messages declines over time, but the proportion of the traditional dialects stays relatively steady.
%U https://aclanthology.org/2024.lrec-main.1048
%P 12003-12015
Markdown (Informal)
[Murre24: Dialect Identification of Finnish Internet Forum Messages](https://aclanthology.org/2024.lrec-main.1048) (Kuparinen, LREC-COLING 2024)
ACL