@inproceedings{trye-etal-2022-hybrid,
title = "A Hybrid Architecture for Labelling Bilingual {M}{\={a}}ori-{E}nglish Tweets",
author = {Trye, David and
Yogarajan, Vithya and
K{\"o}nig, Jemma and
Keegan, Te Taka and
Bainbridge, David and
Apperley, Mark},
editor = "He, Yulan and
Ji, Heng and
Li, Sujian and
Liu, Yang and
Chang, Chua-Hui",
booktitle = "Findings of the Association for Computational Linguistics: AACL-IJCNLP 2022",
month = nov,
year = "2022",
address = "Online only",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-aacl.11/",
doi = "10.18653/v1/2022.findings-aacl.11",
pages = "119--130",
abstract = "Most large-scale language detection tools perform poorly at identifying M{\={a}}ori text. Moreover, rule-based and machine learning-based techniques devised specifically for the M{\={a}}ori-English language pair struggle with interlingual homographs. We develop a hybrid architecture that couples M{\={a}}ori-language orthography with machine learning models in order to annotate mixed M{\={a}}ori-English text. This architecture is used to label a new bilingual Twitter corpus at both the token (word) and tweet (sentence) levels. We use the collected tweets to show that the hybrid approach outperforms existing systems with respect to language detection of interlingual homographs and overall accuracy. We also evaluate its performance on out-of-domain data. Two interactive visualisations are provided for exploring the Twitter corpus and comparing errors across the new and existing techniques. The architecture code and visualisations are available online, and the corpus is available on request."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="trye-etal-2022-hybrid">
<titleInfo>
<title>A Hybrid Architecture for Labelling Bilingual Māori-English Tweets</title>
</titleInfo>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Trye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vithya</namePart>
<namePart type="family">Yogarajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jemma</namePart>
<namePart type="family">König</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Te</namePart>
<namePart type="given">Taka</namePart>
<namePart type="family">Keegan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Bainbridge</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Apperley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: AACL-IJCNLP 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chua-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online only</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Most large-scale language detection tools perform poorly at identifying Māori text. Moreover, rule-based and machine learning-based techniques devised specifically for the Māori-English language pair struggle with interlingual homographs. We develop a hybrid architecture that couples Māori-language orthography with machine learning models in order to annotate mixed Māori-English text. This architecture is used to label a new bilingual Twitter corpus at both the token (word) and tweet (sentence) levels. We use the collected tweets to show that the hybrid approach outperforms existing systems with respect to language detection of interlingual homographs and overall accuracy. We also evaluate its performance on out-of-domain data. Two interactive visualisations are provided for exploring the Twitter corpus and comparing errors across the new and existing techniques. The architecture code and visualisations are available online, and the corpus is available on request.</abstract>
<identifier type="citekey">trye-etal-2022-hybrid</identifier>
<identifier type="doi">10.18653/v1/2022.findings-aacl.11</identifier>
<location>
<url>https://aclanthology.org/2022.findings-aacl.11/</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>119</start>
<end>130</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Hybrid Architecture for Labelling Bilingual Māori-English Tweets
%A Trye, David
%A Yogarajan, Vithya
%A König, Jemma
%A Keegan, Te Taka
%A Bainbridge, David
%A Apperley, Mark
%Y He, Yulan
%Y Ji, Heng
%Y Li, Sujian
%Y Liu, Yang
%Y Chang, Chua-Hui
%S Findings of the Association for Computational Linguistics: AACL-IJCNLP 2022
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online only
%F trye-etal-2022-hybrid
%X Most large-scale language detection tools perform poorly at identifying Māori text. Moreover, rule-based and machine learning-based techniques devised specifically for the Māori-English language pair struggle with interlingual homographs. We develop a hybrid architecture that couples Māori-language orthography with machine learning models in order to annotate mixed Māori-English text. This architecture is used to label a new bilingual Twitter corpus at both the token (word) and tweet (sentence) levels. We use the collected tweets to show that the hybrid approach outperforms existing systems with respect to language detection of interlingual homographs and overall accuracy. We also evaluate its performance on out-of-domain data. Two interactive visualisations are provided for exploring the Twitter corpus and comparing errors across the new and existing techniques. The architecture code and visualisations are available online, and the corpus is available on request.
%R 10.18653/v1/2022.findings-aacl.11
%U https://aclanthology.org/2022.findings-aacl.11/
%U https://doi.org/10.18653/v1/2022.findings-aacl.11
%P 119-130
Markdown (Informal)
[A Hybrid Architecture for Labelling Bilingual Māori-English Tweets](https://aclanthology.org/2022.findings-aacl.11/) (Trye et al., Findings 2022)
ACL