@inproceedings{kirstein-hansen-etal-2023-dantok,
title = "{D}an{T}ok: Domain Beats Language for {D}anish Social Media {POS} Tagging",
author = {Kirstein Hansen, Kia and
Barrett, Maria and
M{\"u}ller-Eberstein, Max and
Damgaard, Cathrine and
Eriksen, Trine and
van der Goot, Rob},
booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)",
month = may,
year = "2023",
address = "T{\'o}rshavn, Faroe Islands",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2023.nodalida-1.27",
pages = "271--279",
abstract = "Language from social media remains challenging to process automatically, especially for non-English languages. In this work, we introduce the first NLP dataset for TikTok comments and the first Danish social media dataset with part-of-speech annotation. We further supply annotations for normalization, code-switching, and annotator uncertainty. As transferring models to such a highly specialized domain is non-trivial, we conduct an extensive study into which source data and modeling decisions most impact the performance. Surprisingly, transferring from in-domain data, even from a different language, outperforms in-language, out-of-domain training. These benefits nonetheless rely on the underlying language models having been at least partially pre-trained on data from the target language. Using our additional annotation layers, we further analyze how normalization, code-switching, and human uncertainty affect the tagging accuracy.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kirstein-hansen-etal-2023-dantok">
<titleInfo>
<title>DanTok: Domain Beats Language for Danish Social Media POS Tagging</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kia</namePart>
<namePart type="family">Kirstein Hansen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Barrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Max</namePart>
<namePart type="family">Müller-Eberstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cathrine</namePart>
<namePart type="family">Damgaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trine</namePart>
<namePart type="family">Eriksen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">van der Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)</title>
</titleInfo>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tórshavn, Faroe Islands</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language from social media remains challenging to process automatically, especially for non-English languages. In this work, we introduce the first NLP dataset for TikTok comments and the first Danish social media dataset with part-of-speech annotation. We further supply annotations for normalization, code-switching, and annotator uncertainty. As transferring models to such a highly specialized domain is non-trivial, we conduct an extensive study into which source data and modeling decisions most impact the performance. Surprisingly, transferring from in-domain data, even from a different language, outperforms in-language, out-of-domain training. These benefits nonetheless rely on the underlying language models having been at least partially pre-trained on data from the target language. Using our additional annotation layers, we further analyze how normalization, code-switching, and human uncertainty affect the tagging accuracy.</abstract>
<identifier type="citekey">kirstein-hansen-etal-2023-dantok</identifier>
<location>
<url>https://aclanthology.org/2023.nodalida-1.27</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>271</start>
<end>279</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T DanTok: Domain Beats Language for Danish Social Media POS Tagging
%A Kirstein Hansen, Kia
%A Barrett, Maria
%A Müller-Eberstein, Max
%A Damgaard, Cathrine
%A Eriksen, Trine
%A van der Goot, Rob
%S Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)
%D 2023
%8 May
%I University of Tartu Library
%C Tórshavn, Faroe Islands
%F kirstein-hansen-etal-2023-dantok
%X Language from social media remains challenging to process automatically, especially for non-English languages. In this work, we introduce the first NLP dataset for TikTok comments and the first Danish social media dataset with part-of-speech annotation. We further supply annotations for normalization, code-switching, and annotator uncertainty. As transferring models to such a highly specialized domain is non-trivial, we conduct an extensive study into which source data and modeling decisions most impact the performance. Surprisingly, transferring from in-domain data, even from a different language, outperforms in-language, out-of-domain training. These benefits nonetheless rely on the underlying language models having been at least partially pre-trained on data from the target language. Using our additional annotation layers, we further analyze how normalization, code-switching, and human uncertainty affect the tagging accuracy.
%U https://aclanthology.org/2023.nodalida-1.27
%P 271-279
Markdown (Informal)
[DanTok: Domain Beats Language for Danish Social Media POS Tagging](https://aclanthology.org/2023.nodalida-1.27) (Kirstein Hansen et al., NoDaLiDa 2023)
ACL
- Kia Kirstein Hansen, Maria Barrett, Max Müller-Eberstein, Cathrine Damgaard, Trine Eriksen, and Rob van der Goot. 2023. DanTok: Domain Beats Language for Danish Social Media POS Tagging. In Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa), pages 271–279, Tórshavn, Faroe Islands. University of Tartu Library.