@inproceedings{osmelak-wintner-2023-denglisch,
title = "The Denglisch Corpus of {G}erman-{E}nglish Code-Switching",
author = "Osmelak, Doreen and
Wintner, Shuly",
editor = "Beinborn, Lisa and
Goswami, Koustava and
Murado{\u{g}}lu, Saliha and
Sorokin, Alexey and
Kumar, Ritesh and
Shcherbakov, Andreas and
Ponti, Edoardo M. and
Cotterell, Ryan and
Vylomova, Ekaterina",
booktitle = "Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sigtyp-1.5",
doi = "10.18653/v1/2023.sigtyp-1.5",
pages = "42--51",
abstract = "When multilingual speakers are involved in a conversation, they inevitably introduce code-switching (CS), i.e., mixing of more than one language between and within utterances. CS is still an understudied phenomenon, especially in the written medium, and relatively few computational resources for studying it are available. We describe a corpus of German-English code-switching in social media interactions. We focus on some challenges in annotating CS, especially due to words whose language ID cannot be easily determined. We introduce a novel schema for such word-level annotation, with which we manually annotated a subset of the corpus. We then trained classifiers to predict and identify switches, and applied them to the remainder of the corpus. Thereby, we created a large-scale corpus of German-English mixed utterances with precise indications of CS points.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="osmelak-wintner-2023-denglisch">
<titleInfo>
<title>The Denglisch Corpus of German-English Code-Switching</title>
</titleInfo>
<name type="personal">
<namePart type="given">Doreen</namePart>
<namePart type="family">Osmelak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuly</namePart>
<namePart type="family">Wintner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lisa</namePart>
<namePart type="family">Beinborn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Koustava</namePart>
<namePart type="family">Goswami</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saliha</namePart>
<namePart type="family">Muradoğlu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexey</namePart>
<namePart type="family">Sorokin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ritesh</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Shcherbakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edoardo</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Ponti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>When multilingual speakers are involved in a conversation, they inevitably introduce code-switching (CS), i.e., mixing of more than one language between and within utterances. CS is still an understudied phenomenon, especially in the written medium, and relatively few computational resources for studying it are available. We describe a corpus of German-English code-switching in social media interactions. We focus on some challenges in annotating CS, especially due to words whose language ID cannot be easily determined. We introduce a novel schema for such word-level annotation, with which we manually annotated a subset of the corpus. We then trained classifiers to predict and identify switches, and applied them to the remainder of the corpus. Thereby, we created a large-scale corpus of German-English mixed utterances with precise indications of CS points.</abstract>
<identifier type="citekey">osmelak-wintner-2023-denglisch</identifier>
<identifier type="doi">10.18653/v1/2023.sigtyp-1.5</identifier>
<location>
<url>https://aclanthology.org/2023.sigtyp-1.5</url>
</location>
<part>
<date>2023-05</date>
<extent unit="page">
<start>42</start>
<end>51</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Denglisch Corpus of German-English Code-Switching
%A Osmelak, Doreen
%A Wintner, Shuly
%Y Beinborn, Lisa
%Y Goswami, Koustava
%Y Muradoğlu, Saliha
%Y Sorokin, Alexey
%Y Kumar, Ritesh
%Y Shcherbakov, Andreas
%Y Ponti, Edoardo M.
%Y Cotterell, Ryan
%Y Vylomova, Ekaterina
%S Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F osmelak-wintner-2023-denglisch
%X When multilingual speakers are involved in a conversation, they inevitably introduce code-switching (CS), i.e., mixing of more than one language between and within utterances. CS is still an understudied phenomenon, especially in the written medium, and relatively few computational resources for studying it are available. We describe a corpus of German-English code-switching in social media interactions. We focus on some challenges in annotating CS, especially due to words whose language ID cannot be easily determined. We introduce a novel schema for such word-level annotation, with which we manually annotated a subset of the corpus. We then trained classifiers to predict and identify switches, and applied them to the remainder of the corpus. Thereby, we created a large-scale corpus of German-English mixed utterances with precise indications of CS points.
%R 10.18653/v1/2023.sigtyp-1.5
%U https://aclanthology.org/2023.sigtyp-1.5
%U https://doi.org/10.18653/v1/2023.sigtyp-1.5
%P 42-51
Markdown (Informal)
[The Denglisch Corpus of German-English Code-Switching](https://aclanthology.org/2023.sigtyp-1.5) (Osmelak & Wintner, SIGTYP 2023)
ACL
- Doreen Osmelak and Shuly Wintner. 2023. The Denglisch Corpus of German-English Code-Switching. In Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP, pages 42–51, Dubrovnik, Croatia. Association for Computational Linguistics.