@inproceedings{bafna-etal-2022-combining,
title = "Combining Noisy Semantic Signals with Orthographic Cues: Cognate Induction for the {I}ndic Dialect Continuum",
author = "Bafna, Niyati and
van Genabith, Josef and
Espa{\~n}a-Bonet, Cristina and
{\v{Z}}abokrtsk{\'y}, Zden{\v{e}}k",
editor = "Fokkens, Antske and
Srikumar, Vivek",
booktitle = "Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.conll-1.9",
doi = "10.18653/v1/2022.conll-1.9",
pages = "110--131",
abstract = "We present a novel method for unsupervised cognate/borrowing identification from monolingual corpora designed for low and extremely low resource scenarios, based on combining noisy semantic signals from joint bilingual spaces with orthographic cues modelling sound change. We apply our method to the North Indian dialect continuum, containing several dozens of dialects and languages spoken by more than 100 million people. Many of these languages are zero-resource and therefore natural language processing for them is non-existent. We first collect monolingual data for 26 Indic languages, 16 of which were previously zero-resource, and perform exploratory character, lexical and subword cross-lingual alignment experiments for the first time at this scale on this dialect continuum. We create bilingual evaluation lexicons against Hindi for 20 of the languages. We then apply our cognate identification method on the data, and show that our method outperforms both traditional orthography baselines as well as EM-style learnt edit distance matrices. To the best of our knowledge, this is the first work to combine traditional orthographic cues with noisy bilingual embeddings to tackle unsupervised cognate detection in a (truly) low-resource setup, showing that even noisy bilingual embeddings can act as good guides for this task. We release our multilingual dialect corpus, called HinDialect, as well as our scripts for evaluation data collection and cognate induction.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bafna-etal-2022-combining">
<titleInfo>
<title>Combining Noisy Semantic Signals with Orthographic Cues: Cognate Induction for the Indic Dialect Continuum</title>
</titleInfo>
<name type="personal">
<namePart type="given">Niyati</namePart>
<namePart type="family">Bafna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josef</namePart>
<namePart type="family">van Genabith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cristina</namePart>
<namePart type="family">España-Bonet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zdeněk</namePart>
<namePart type="family">Žabokrtský</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antske</namePart>
<namePart type="family">Fokkens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a novel method for unsupervised cognate/borrowing identification from monolingual corpora designed for low and extremely low resource scenarios, based on combining noisy semantic signals from joint bilingual spaces with orthographic cues modelling sound change. We apply our method to the North Indian dialect continuum, containing several dozens of dialects and languages spoken by more than 100 million people. Many of these languages are zero-resource and therefore natural language processing for them is non-existent. We first collect monolingual data for 26 Indic languages, 16 of which were previously zero-resource, and perform exploratory character, lexical and subword cross-lingual alignment experiments for the first time at this scale on this dialect continuum. We create bilingual evaluation lexicons against Hindi for 20 of the languages. We then apply our cognate identification method on the data, and show that our method outperforms both traditional orthography baselines as well as EM-style learnt edit distance matrices. To the best of our knowledge, this is the first work to combine traditional orthographic cues with noisy bilingual embeddings to tackle unsupervised cognate detection in a (truly) low-resource setup, showing that even noisy bilingual embeddings can act as good guides for this task. We release our multilingual dialect corpus, called HinDialect, as well as our scripts for evaluation data collection and cognate induction.</abstract>
<identifier type="citekey">bafna-etal-2022-combining</identifier>
<identifier type="doi">10.18653/v1/2022.conll-1.9</identifier>
<location>
<url>https://aclanthology.org/2022.conll-1.9</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>110</start>
<end>131</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Combining Noisy Semantic Signals with Orthographic Cues: Cognate Induction for the Indic Dialect Continuum
%A Bafna, Niyati
%A van Genabith, Josef
%A España-Bonet, Cristina
%A Žabokrtský, Zdeněk
%Y Fokkens, Antske
%Y Srikumar, Vivek
%S Proceedings of the 26th Conference on Computational Natural Language Learning (CoNLL)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F bafna-etal-2022-combining
%X We present a novel method for unsupervised cognate/borrowing identification from monolingual corpora designed for low and extremely low resource scenarios, based on combining noisy semantic signals from joint bilingual spaces with orthographic cues modelling sound change. We apply our method to the North Indian dialect continuum, containing several dozens of dialects and languages spoken by more than 100 million people. Many of these languages are zero-resource and therefore natural language processing for them is non-existent. We first collect monolingual data for 26 Indic languages, 16 of which were previously zero-resource, and perform exploratory character, lexical and subword cross-lingual alignment experiments for the first time at this scale on this dialect continuum. We create bilingual evaluation lexicons against Hindi for 20 of the languages. We then apply our cognate identification method on the data, and show that our method outperforms both traditional orthography baselines as well as EM-style learnt edit distance matrices. To the best of our knowledge, this is the first work to combine traditional orthographic cues with noisy bilingual embeddings to tackle unsupervised cognate detection in a (truly) low-resource setup, showing that even noisy bilingual embeddings can act as good guides for this task. We release our multilingual dialect corpus, called HinDialect, as well as our scripts for evaluation data collection and cognate induction.
%R 10.18653/v1/2022.conll-1.9
%U https://aclanthology.org/2022.conll-1.9
%U https://doi.org/10.18653/v1/2022.conll-1.9
%P 110-131
Markdown (Informal)
[Combining Noisy Semantic Signals with Orthographic Cues: Cognate Induction for the Indic Dialect Continuum](https://aclanthology.org/2022.conll-1.9) (Bafna et al., CoNLL 2022)
ACL