@inproceedings{braggaar-van-der-goot-2021-challenges,
title = "Challenges in Annotating and Parsing Spoken, Code-switched, {F}risian-{D}utch Data",
author = "Braggaar, Anouck and
van der Goot, Rob",
editor = "Ben-David, Eyal and
Cohen, Shay and
McDonald, Ryan and
Plank, Barbara and
Reichart, Roi and
Rotman, Guy and
Ziser, Yftah",
booktitle = "Proceedings of the Second Workshop on Domain Adaptation for NLP",
month = apr,
year = "2021",
address = "Kyiv, Ukraine",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.adaptnlp-1.6",
pages = "50--58",
abstract = "While high performance have been obtained for high-resource languages, performance on low-resource languages lags behind. In this paper we focus on the parsing of the low-resource language Frisian. We use a sample of code-switched, spontaneously spoken data, which proves to be a challenging setup. We propose to train a parser specifically tailored towards the target domain, by selecting instances from multiple treebanks. Specifically, we use Latent Dirichlet Allocation (LDA), with word and character N-grams. We use a deep biaffine parser initialized with mBERT. The best single source treebank (nl{\_}alpino) resulted in an LAS of 54.7 whereas our data selection outperformed the single best transfer treebank and led to 55.6 LAS on the test data. Additional experiments consisted of removing diacritics from our Frisian data, creating more similar training data by cropping sentences and running our best model using XLM-R. These experiments did not lead to a better performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="braggaar-van-der-goot-2021-challenges">
<titleInfo>
<title>Challenges in Annotating and Parsing Spoken, Code-switched, Frisian-Dutch Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anouck</namePart>
<namePart type="family">Braggaar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rob</namePart>
<namePart type="family">van der Goot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Domain Adaptation for NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eyal</namePart>
<namePart type="family">Ben-David</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shay</namePart>
<namePart type="family">Cohen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">McDonald</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Plank</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roi</namePart>
<namePart type="family">Reichart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guy</namePart>
<namePart type="family">Rotman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yftah</namePart>
<namePart type="family">Ziser</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Kyiv, Ukraine</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While high performance have been obtained for high-resource languages, performance on low-resource languages lags behind. In this paper we focus on the parsing of the low-resource language Frisian. We use a sample of code-switched, spontaneously spoken data, which proves to be a challenging setup. We propose to train a parser specifically tailored towards the target domain, by selecting instances from multiple treebanks. Specifically, we use Latent Dirichlet Allocation (LDA), with word and character N-grams. We use a deep biaffine parser initialized with mBERT. The best single source treebank (nl_alpino) resulted in an LAS of 54.7 whereas our data selection outperformed the single best transfer treebank and led to 55.6 LAS on the test data. Additional experiments consisted of removing diacritics from our Frisian data, creating more similar training data by cropping sentences and running our best model using XLM-R. These experiments did not lead to a better performance.</abstract>
<identifier type="citekey">braggaar-van-der-goot-2021-challenges</identifier>
<location>
<url>https://aclanthology.org/2021.adaptnlp-1.6</url>
</location>
<part>
<date>2021-04</date>
<extent unit="page">
<start>50</start>
<end>58</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Challenges in Annotating and Parsing Spoken, Code-switched, Frisian-Dutch Data
%A Braggaar, Anouck
%A van der Goot, Rob
%Y Ben-David, Eyal
%Y Cohen, Shay
%Y McDonald, Ryan
%Y Plank, Barbara
%Y Reichart, Roi
%Y Rotman, Guy
%Y Ziser, Yftah
%S Proceedings of the Second Workshop on Domain Adaptation for NLP
%D 2021
%8 April
%I Association for Computational Linguistics
%C Kyiv, Ukraine
%F braggaar-van-der-goot-2021-challenges
%X While high performance have been obtained for high-resource languages, performance on low-resource languages lags behind. In this paper we focus on the parsing of the low-resource language Frisian. We use a sample of code-switched, spontaneously spoken data, which proves to be a challenging setup. We propose to train a parser specifically tailored towards the target domain, by selecting instances from multiple treebanks. Specifically, we use Latent Dirichlet Allocation (LDA), with word and character N-grams. We use a deep biaffine parser initialized with mBERT. The best single source treebank (nl_alpino) resulted in an LAS of 54.7 whereas our data selection outperformed the single best transfer treebank and led to 55.6 LAS on the test data. Additional experiments consisted of removing diacritics from our Frisian data, creating more similar training data by cropping sentences and running our best model using XLM-R. These experiments did not lead to a better performance.
%U https://aclanthology.org/2021.adaptnlp-1.6
%P 50-58
Markdown (Informal)
[Challenges in Annotating and Parsing Spoken, Code-switched, Frisian-Dutch Data](https://aclanthology.org/2021.adaptnlp-1.6) (Braggaar & van der Goot, AdaptNLP 2021)
ACL