@inproceedings{xiang-etal-2024-cantonese,
title = "{C}antonese Natural Language Processing in the Transformers Era",
author = "Xiang, Rong and
Liao, Ming and
Li, Jing",
editor = "Wong, Kam-Fai and
Zhang, Min and
Xu, Ruifeng and
Li, Jing and
Wei, Zhongyu and
Gui, Lin and
Liang, Bin and
Zhao, Runcong",
booktitle = "Proceedings of the 10th SIGHAN Workshop on Chinese Language Processing (SIGHAN-10)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.sighan-1.8",
pages = "69--79",
abstract = "Despite being spoken by a large population of speakers worldwide, Cantonese is under-resourced in terms of the data scale and diversity compared to other major languages. This limitation has excluded it from the current {``}pre-training and fine-tuning{''} paradigm that is dominated by Transformer architectures.In this paper, we provide a comprehensive review on the existing resources and methodologies for Cantonese Natural Language Processing, covering the recent progress in language understanding, text generation and development of language models.We finally discuss two aspects of the Cantonese language that could make it potentially challenging even for state-of-the-art architectures: \textit{colloquialism} and \textit{multilinguality}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xiang-etal-2024-cantonese">
<titleInfo>
<title>Cantonese Natural Language Processing in the Transformers Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rong</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Liao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th SIGHAN Workshop on Chinese Language Processing (SIGHAN-10)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kam-Fai</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruifeng</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongyu</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lin</namePart>
<namePart type="family">Gui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Liang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Runcong</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite being spoken by a large population of speakers worldwide, Cantonese is under-resourced in terms of the data scale and diversity compared to other major languages. This limitation has excluded it from the current “pre-training and fine-tuning” paradigm that is dominated by Transformer architectures.In this paper, we provide a comprehensive review on the existing resources and methodologies for Cantonese Natural Language Processing, covering the recent progress in language understanding, text generation and development of language models.We finally discuss two aspects of the Cantonese language that could make it potentially challenging even for state-of-the-art architectures: colloquialism and multilinguality.</abstract>
<identifier type="citekey">xiang-etal-2024-cantonese</identifier>
<location>
<url>https://aclanthology.org/2024.sighan-1.8</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>69</start>
<end>79</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cantonese Natural Language Processing in the Transformers Era
%A Xiang, Rong
%A Liao, Ming
%A Li, Jing
%Y Wong, Kam-Fai
%Y Zhang, Min
%Y Xu, Ruifeng
%Y Li, Jing
%Y Wei, Zhongyu
%Y Gui, Lin
%Y Liang, Bin
%Y Zhao, Runcong
%S Proceedings of the 10th SIGHAN Workshop on Chinese Language Processing (SIGHAN-10)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F xiang-etal-2024-cantonese
%X Despite being spoken by a large population of speakers worldwide, Cantonese is under-resourced in terms of the data scale and diversity compared to other major languages. This limitation has excluded it from the current “pre-training and fine-tuning” paradigm that is dominated by Transformer architectures.In this paper, we provide a comprehensive review on the existing resources and methodologies for Cantonese Natural Language Processing, covering the recent progress in language understanding, text generation and development of language models.We finally discuss two aspects of the Cantonese language that could make it potentially challenging even for state-of-the-art architectures: colloquialism and multilinguality.
%U https://aclanthology.org/2024.sighan-1.8
%P 69-79
Markdown (Informal)
[Cantonese Natural Language Processing in the Transformers Era](https://aclanthology.org/2024.sighan-1.8) (Xiang et al., SIGHAN-WS 2024)
ACL