@inproceedings{kuparinen-scherrer-2023-dialect,
title = "Dialect Representation Learning with Neural Dialect-to-Standard Normalization",
author = "Kuparinen, Olli and
Scherrer, Yves",
editor = {Scherrer, Yves and
Jauhiainen, Tommi and
Ljube{\v{s}}i{\'c}, Nikola and
Nakov, Preslav and
Tiedemann, J{\"o}rg and
Zampieri, Marcos},
booktitle = "Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.vardial-1.20",
doi = "10.18653/v1/2023.vardial-1.20",
pages = "200--212",
abstract = "Language label tokens are often used in multilingual neural language modeling and sequence-to-sequence learning to enhance the performance of such models. An additional product of the technique is that the models learn representations of the language tokens, which in turn reflect the relationships between the languages. In this paper, we study the learned representations of dialects produced by neural dialect-to-standard normalization models. We use two large datasets of typologically different languages, namely Finnish and Norwegian, and evaluate the learned representations against traditional dialect divisions of both languages. We find that the inferred dialect embeddings correlate well with the traditional dialects. The methodology could be further used in noisier settings to find new insights into language variation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kuparinen-scherrer-2023-dialect">
<titleInfo>
<title>Dialect Representation Learning with Neural Dialect-to-Standard Normalization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Olli</namePart>
<namePart type="family">Kuparinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Scherrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Scherrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Jauhiainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language label tokens are often used in multilingual neural language modeling and sequence-to-sequence learning to enhance the performance of such models. An additional product of the technique is that the models learn representations of the language tokens, which in turn reflect the relationships between the languages. In this paper, we study the learned representations of dialects produced by neural dialect-to-standard normalization models. We use two large datasets of typologically different languages, namely Finnish and Norwegian, and evaluate the learned representations against traditional dialect divisions of both languages. We find that the inferred dialect embeddings correlate well with the traditional dialects. The methodology could be further used in noisier settings to find new insights into language variation.</abstract>
<identifier type="citekey">kuparinen-scherrer-2023-dialect</identifier>
<identifier type="doi">10.18653/v1/2023.vardial-1.20</identifier>
<location>
<url>https://aclanthology.org/2023.vardial-1.20</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>200</start>
<end>212</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dialect Representation Learning with Neural Dialect-to-Standard Normalization
%A Kuparinen, Olli
%A Scherrer, Yves
%Y Scherrer, Yves
%Y Jauhiainen, Tommi
%Y Ljubešić, Nikola
%Y Nakov, Preslav
%Y Tiedemann, Jörg
%Y Zampieri, Marcos
%S Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F kuparinen-scherrer-2023-dialect
%X Language label tokens are often used in multilingual neural language modeling and sequence-to-sequence learning to enhance the performance of such models. An additional product of the technique is that the models learn representations of the language tokens, which in turn reflect the relationships between the languages. In this paper, we study the learned representations of dialects produced by neural dialect-to-standard normalization models. We use two large datasets of typologically different languages, namely Finnish and Norwegian, and evaluate the learned representations against traditional dialect divisions of both languages. We find that the inferred dialect embeddings correlate well with the traditional dialects. The methodology could be further used in noisier settings to find new insights into language variation.
%R 10.18653/v1/2023.vardial-1.20
%U https://aclanthology.org/2023.vardial-1.20
%U https://doi.org/10.18653/v1/2023.vardial-1.20
%P 200-212
Markdown (Informal)
[Dialect Representation Learning with Neural Dialect-to-Standard Normalization](https://aclanthology.org/2023.vardial-1.20) (Kuparinen & Scherrer, VarDial 2023)
ACL