@inproceedings{chen-bjerva-2023-colexifications,
title = "Colexifications for Bootstrapping Cross-lingual Datasets: The Case of Phonology, Concreteness, and Affectiveness",
author = "Chen, Yiyi and
Bjerva, Johannes",
editor = {Nicolai, Garrett and
Chodroff, Eleanor and
Mailhot, Frederic and
{\c{C}}{\"o}ltekin, {\c{C}}a{\u{g}}r{\i}},
booktitle = "Proceedings of the 20th SIGMORPHON workshop on Computational Research in Phonetics, Phonology, and Morphology",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sigmorphon-1.11",
doi = "10.18653/v1/2023.sigmorphon-1.11",
pages = "98--109",
abstract = "Colexification refers to the linguistic phenomenon where a single lexical form is used to convey multiple meanings. By studying cross-lingual colexifications, researchers have gained valuable insights into fields such as psycholinguistics and cognitive sciences (Jack- son et al., 2019; Xu et al., 2020; Karjus et al., 2021; Schapper and Koptjevskaja-Tamm, 2022; Fran{\~A}{\S}ois, 2022). While several multilingual colexification datasets exist, there is untapped potential in using this information to bootstrap datasets across such semantic features. In this paper, we aim to demonstrate how colexifications can be leveraged to create such cross-lingual datasets. We showcase curation procedures which result in a dataset covering 142 languages across 21 language families across the world. The dataset includes ratings of concreteness and affectiveness, mapped with phonemes and phonological features. We further analyze the dataset along different dimensions to demonstrate potential of the proposed procedures in facilitating further interdisciplinary research in psychology, cognitive science, and multilingual natural language processing (NLP). Based on initial investigations, we observe that i) colexifications that are closer in concreteness/affectiveness are more likely to colexify ; ii) certain initial/last phonemes are significantly correlated with concreteness/affectiveness intra language families, such as /k/ as the initial phoneme in both Turkic and Tai-Kadai correlated with concreteness, and /p/ in Dravidian and Sino-Tibetan correlated with Valence; iii) the type-to-token ratio (TTR) of phonemes are positively correlated with concreteness across several language families, while the length of phoneme segments are negatively correlated with concreteness; iv) certain phonological features are negatively correlated with concreteness across languages. The dataset is made public online for further research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-bjerva-2023-colexifications">
<titleInfo>
<title>Colexifications for Bootstrapping Cross-lingual Datasets: The Case of Phonology, Concreteness, and Affectiveness</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiyi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johannes</namePart>
<namePart type="family">Bjerva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 20th SIGMORPHON workshop on Computational Research in Phonetics, Phonology, and Morphology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Garrett</namePart>
<namePart type="family">Nicolai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eleanor</namePart>
<namePart type="family">Chodroff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frederic</namePart>
<namePart type="family">Mailhot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Çağrı</namePart>
<namePart type="family">Çöltekin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Colexification refers to the linguistic phenomenon where a single lexical form is used to convey multiple meanings. By studying cross-lingual colexifications, researchers have gained valuable insights into fields such as psycholinguistics and cognitive sciences (Jack- son et al., 2019; Xu et al., 2020; Karjus et al., 2021; Schapper and Koptjevskaja-Tamm, 2022; François, 2022). While several multilingual colexification datasets exist, there is untapped potential in using this information to bootstrap datasets across such semantic features. In this paper, we aim to demonstrate how colexifications can be leveraged to create such cross-lingual datasets. We showcase curation procedures which result in a dataset covering 142 languages across 21 language families across the world. The dataset includes ratings of concreteness and affectiveness, mapped with phonemes and phonological features. We further analyze the dataset along different dimensions to demonstrate potential of the proposed procedures in facilitating further interdisciplinary research in psychology, cognitive science, and multilingual natural language processing (NLP). Based on initial investigations, we observe that i) colexifications that are closer in concreteness/affectiveness are more likely to colexify ; ii) certain initial/last phonemes are significantly correlated with concreteness/affectiveness intra language families, such as /k/ as the initial phoneme in both Turkic and Tai-Kadai correlated with concreteness, and /p/ in Dravidian and Sino-Tibetan correlated with Valence; iii) the type-to-token ratio (TTR) of phonemes are positively correlated with concreteness across several language families, while the length of phoneme segments are negatively correlated with concreteness; iv) certain phonological features are negatively correlated with concreteness across languages. The dataset is made public online for further research.</abstract>
<identifier type="citekey">chen-bjerva-2023-colexifications</identifier>
<identifier type="doi">10.18653/v1/2023.sigmorphon-1.11</identifier>
<location>
<url>https://aclanthology.org/2023.sigmorphon-1.11</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>98</start>
<end>109</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Colexifications for Bootstrapping Cross-lingual Datasets: The Case of Phonology, Concreteness, and Affectiveness
%A Chen, Yiyi
%A Bjerva, Johannes
%Y Nicolai, Garrett
%Y Chodroff, Eleanor
%Y Mailhot, Frederic
%Y Çöltekin, Çağrı
%S Proceedings of the 20th SIGMORPHON workshop on Computational Research in Phonetics, Phonology, and Morphology
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F chen-bjerva-2023-colexifications
%X Colexification refers to the linguistic phenomenon where a single lexical form is used to convey multiple meanings. By studying cross-lingual colexifications, researchers have gained valuable insights into fields such as psycholinguistics and cognitive sciences (Jack- son et al., 2019; Xu et al., 2020; Karjus et al., 2021; Schapper and Koptjevskaja-Tamm, 2022; François, 2022). While several multilingual colexification datasets exist, there is untapped potential in using this information to bootstrap datasets across such semantic features. In this paper, we aim to demonstrate how colexifications can be leveraged to create such cross-lingual datasets. We showcase curation procedures which result in a dataset covering 142 languages across 21 language families across the world. The dataset includes ratings of concreteness and affectiveness, mapped with phonemes and phonological features. We further analyze the dataset along different dimensions to demonstrate potential of the proposed procedures in facilitating further interdisciplinary research in psychology, cognitive science, and multilingual natural language processing (NLP). Based on initial investigations, we observe that i) colexifications that are closer in concreteness/affectiveness are more likely to colexify ; ii) certain initial/last phonemes are significantly correlated with concreteness/affectiveness intra language families, such as /k/ as the initial phoneme in both Turkic and Tai-Kadai correlated with concreteness, and /p/ in Dravidian and Sino-Tibetan correlated with Valence; iii) the type-to-token ratio (TTR) of phonemes are positively correlated with concreteness across several language families, while the length of phoneme segments are negatively correlated with concreteness; iv) certain phonological features are negatively correlated with concreteness across languages. The dataset is made public online for further research.
%R 10.18653/v1/2023.sigmorphon-1.11
%U https://aclanthology.org/2023.sigmorphon-1.11
%U https://doi.org/10.18653/v1/2023.sigmorphon-1.11
%P 98-109
Markdown (Informal)
[Colexifications for Bootstrapping Cross-lingual Datasets: The Case of Phonology, Concreteness, and Affectiveness](https://aclanthology.org/2023.sigmorphon-1.11) (Chen & Bjerva, SIGMORPHON 2023)
ACL