@inproceedings{james-etal-2022-language,
title = "Language Models for Code-switch Detection of te reo {M}{\=a}ori and {E}nglish in a Low-resource Setting",
author = "James, Jesin and
Yogarajan, Vithya and
Shields, Isabella and
Watson, Catherine and
Keegan, Peter and
Mahelona, Keoni and
Jones, Peter-Lucas",
editor = "Carpuat, Marine and
de Marneffe, Marie-Catherine and
Meza Ruiz, Ivan Vladimir",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2022",
month = jul,
year = "2022",
address = "Seattle, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-naacl.49",
doi = "10.18653/v1/2022.findings-naacl.49",
pages = "650--660",
abstract = "Te reo M{\=a}ori, New Zealand{'}s only indigenous language, is code-switched with English. M{\=a}ori speakers are atleast bilingual, and the use of M{\=a}ori is increasing in New Zealand English. Unfortunately, due to the minimal availability of resources, including digital data, M{\=a}ori is under-represented in technological advances. Cloud-based multilingual systems such as Google and Microsoft Azure support M{\=a}ori language detection. However, we provide experimental evidence to show that the accuracy of such systems is low when detecting M{\=a}ori. Hence, with the support of M{\=a}ori community, we collect M{\=a}ori and bilingual data to use natural language processing (NLP) to improve M{\=a}ori language detection. We train bilingual sub-word embeddings and provide evidence to show that our bilingual embeddings improve overall accuracy compared to the publicly-available monolingual embeddings. This improvement has been verified for various NLP tasks using three bilingual databases containing formal transcripts and informal social media data. We also show that BiLSTM with pre-trained M{\=a}ori-English sub-word embeddings outperforms large-scale contextual language models such as BERT on down streaming tasks of detecting M{\=a}ori language. However, this research uses large models {`}as is{'} for transfer learning, where no further training was done on M{\=a}ori-English data. The best accuracy of 87{\%} was obtained using BiLSTM with bilingual embeddings to detect M{\=a}ori-English code-switching points.",
}
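
The abstract's core method is token-level Māori/English detection with a BiLSTM over pre-trained Māori-English subword embeddings. The following is a minimal sketch of that kind of tagger in PyTorch; it is not the authors' released code, and the vocabulary size, dimensions, and two-label scheme (0 for English, 1 for Māori) are illustrative assumptions.

```python
import torch
import torch.nn as nn

class CodeSwitchTagger(nn.Module):
    """Per-token language tagger: a BiLSTM over subword embeddings."""

    def __init__(self, vocab_size, embed_dim=300, hidden_dim=128, num_labels=2):
        super().__init__()
        # In the paper's setting, this table would be initialised from
        # pre-trained Maori-English bilingual subword vectors.
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, hidden_dim,
                              batch_first=True, bidirectional=True)
        self.classify = nn.Linear(2 * hidden_dim, num_labels)

    def forward(self, token_ids):
        # token_ids: (batch, seq_len) integer subword indices
        x = self.embed(token_ids)
        out, _ = self.bilstm(x)      # (batch, seq_len, 2 * hidden_dim)
        return self.classify(out)    # per-token language logits

# Hypothetical usage on a batch of one 12-subword sentence.
model = CodeSwitchTagger(vocab_size=32000)
logits = model(torch.randint(0, 32000, (1, 12)))
labels = logits.argmax(dim=-1)  # a code-switch point is any label transition
```

Reading off code-switch points as label transitions matches the detection task the abstract evaluates; the 87% figure it reports refers to this kind of per-point accuracy.
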
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="james-etal-2022-language">
<titleInfo>
<title>Language Models for Code-switch Detection of te reo Māori and English in a Low-resource Setting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jesin</namePart>
<namePart type="family">James</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vithya</namePart>
<namePart type="family">Yogarajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isabella</namePart>
<namePart type="family">Shields</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Catherine</namePart>
<namePart type="family">Watson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Keegan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Keoni</namePart>
<namePart type="family">Mahelona</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter-Lucas</namePart>
<namePart type="family">Jones</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marine</namePart>
<namePart type="family">Carpuat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Catherine</namePart>
<namePart type="family">de Marneffe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="given">Vladimir</namePart>
<namePart type="family">Meza Ruiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Seattle, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Te reo Māori, New Zealand’s only indigenous language, is code-switched with English. Māori speakers are at least bilingual, and the use of Māori is increasing in New Zealand English. Unfortunately, due to the minimal availability of resources, including digital data, Māori is under-represented in technological advances. Cloud-based multilingual systems such as Google and Microsoft Azure support Māori language detection. However, we provide experimental evidence to show that the accuracy of such systems is low when detecting Māori. Hence, with the support of the Māori community, we collect Māori and bilingual data to use natural language processing (NLP) to improve Māori language detection. We train bilingual sub-word embeddings and provide evidence to show that our bilingual embeddings improve overall accuracy compared to the publicly available monolingual embeddings. This improvement has been verified for various NLP tasks using three bilingual databases containing formal transcripts and informal social media data. We also show that BiLSTM with pre-trained Māori-English sub-word embeddings outperforms large-scale contextual language models such as BERT on downstream tasks of detecting Māori language. However, this research uses large models ‘as is’ for transfer learning, where no further training was done on Māori-English data. The best accuracy of 87% was obtained using BiLSTM with bilingual embeddings to detect Māori-English code-switching points.</abstract>
<identifier type="citekey">james-etal-2022-language</identifier>
<identifier type="doi">10.18653/v1/2022.findings-naacl.49</identifier>
<location>
<url>https://aclanthology.org/2022.findings-naacl.49</url>
</location>
<part>
<date>2022-07</date>
<extent unit="page">
<start>650</start>
<end>660</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Models for Code-switch Detection of te reo Māori and English in a Low-resource Setting
%A James, Jesin
%A Yogarajan, Vithya
%A Shields, Isabella
%A Watson, Catherine
%A Keegan, Peter
%A Mahelona, Keoni
%A Jones, Peter-Lucas
%Y Carpuat, Marine
%Y de Marneffe, Marie-Catherine
%Y Meza Ruiz, Ivan Vladimir
%S Findings of the Association for Computational Linguistics: NAACL 2022
%D 2022
%8 July
%I Association for Computational Linguistics
%C Seattle, United States
%F james-etal-2022-language
%X Te reo Māori, New Zealand’s only indigenous language, is code-switched with English. Māori speakers are at least bilingual, and the use of Māori is increasing in New Zealand English. Unfortunately, due to the minimal availability of resources, including digital data, Māori is under-represented in technological advances. Cloud-based multilingual systems such as Google and Microsoft Azure support Māori language detection. However, we provide experimental evidence to show that the accuracy of such systems is low when detecting Māori. Hence, with the support of the Māori community, we collect Māori and bilingual data to use natural language processing (NLP) to improve Māori language detection. We train bilingual sub-word embeddings and provide evidence to show that our bilingual embeddings improve overall accuracy compared to the publicly available monolingual embeddings. This improvement has been verified for various NLP tasks using three bilingual databases containing formal transcripts and informal social media data. We also show that BiLSTM with pre-trained Māori-English sub-word embeddings outperforms large-scale contextual language models such as BERT on downstream tasks of detecting Māori language. However, this research uses large models ‘as is’ for transfer learning, where no further training was done on Māori-English data. The best accuracy of 87% was obtained using BiLSTM with bilingual embeddings to detect Māori-English code-switching points.
%R 10.18653/v1/2022.findings-naacl.49
%U https://aclanthology.org/2022.findings-naacl.49
%U https://doi.org/10.18653/v1/2022.findings-naacl.49
%P 650-660
Markdown (Informal)
[Language Models for Code-switch Detection of te reo Māori and English in a Low-resource Setting](https://aclanthology.org/2022.findings-naacl.49) (James et al., Findings 2022)
ACL
Jesin James, Vithya Yogarajan, Isabella Shields, Catherine Watson, Peter Keegan, Keoni Mahelona, and Peter-Lucas Jones. 2022. Language Models for Code-switch Detection of te reo Māori and English in a Low-resource Setting. In Findings of the Association for Computational Linguistics: NAACL 2022, pages 650–660, Seattle, United States. Association for Computational Linguistics.
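
The bilingual subword embeddings the abstract refers to can be approximated with off-the-shelf tooling. The sketch below trains skip-gram subword embeddings with the fastText Python package on a mixed-language corpus; the corpus path and hyperparameters are assumptions, and the paper's exact training setup is not reproduced here.

```python
# Hedged sketch: training subword embeddings on mixed Maori-English text
# with fastText's skip-gram model. 'maori_english_corpus.txt' is a
# hypothetical file with one sentence per line.
import fasttext

model = fasttext.train_unsupervised(
    "maori_english_corpus.txt",
    model="skipgram",
    dim=300,   # embedding dimension
    minn=2,    # shortest character n-gram (the subword units)
    maxn=5,    # longest character n-gram
)

# Character n-gram composition yields vectors even for unseen words,
# e.g. Maori words appearing inside New Zealand English sentences.
vector = model.get_word_vector("whānau")
```

Training on a corpus that mixes both languages is what makes the resulting vectors bilingual: shared character n-grams place Māori and English subwords in a single embedding space, which the abstract credits for the accuracy gain over publicly available monolingual embeddings.
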