@inproceedings{dai-etal-2025-next,
title = "Next-Level {C}antonese-to-{M}andarin Translation: Fine-Tuning and Post-Processing with {LLM}s",
author = "Dai, Yuqian and
Chan, Chun Fai and
Wong, Ying Ki and
Pun, Tsz Ho",
editor = "Hettiarachchi, Hansi and
Ranasinghe, Tharindu and
Rayson, Paul and
Mitkov, Ruslan and
Gaber, Mohamed and
Premasiri, Damith and
Tan, Fiona Anting and
Uyangodage, Lasitha",
booktitle = "Proceedings of the First Workshop on Language Models for Low-Resource Languages",
month = jan,
year = "2025",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.loreslm-1.32/",
pages = "427--436",
abstract = "Large Language Models (LLMs) have improved performance across various natural language processing tasks. Despite these improvements, LLMs continue to face significant challenges, such as grammatical issues and code-switching to English, when applied to low-resource languages like Cantonese in Machine Translation (MT) scenarios. By addressing the unique linguistic and contextual challenges of Cantonese, we present a novel strategy to improve the understanding and translation capabilities of LLMs for Cantonese-to-Mandarin MT. Our strategy comprises three key components: (1) Syntax and Part-of-Speech (POS) fine-tuning, where we use the Universal Dependencies (UD) corpus to fine-tune LLM, focusing on the linguistic structures of Cantonese; (2) Specialized Cantonese to Mandarin sentence pairs, collected from diverse sources such as Cantonese grammar textbooks and manually translated sentences across various domains, to expose the model to a wide range of linguistic contexts; (3) Post-processing with additional LLMs, where we introduce additional LLMs to improve the initial translations, correcting Mandarin grammar and punctuation. Empirical evaluations on human-created test sets show that our proposed strategy improves translation performance and outperforms existing commercial translation models with at least 3 BLEU scores. Additionally, our strategy also benefits other LLMs and a reversed translation direction, demonstrating its generalization and effectiveness."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dai-etal-2025-next">
<titleInfo>
<title>Next-Level Cantonese-to-Mandarin Translation: Fine-Tuning and Post-Processing with LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuqian</namePart>
<namePart type="family">Dai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chun</namePart>
<namePart type="given">Fai</namePart>
<namePart type="family">Chan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="given">Ki</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsz</namePart>
<namePart type="given">Ho</namePart>
<namePart type="family">Pun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Language Models for Low-Resource Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hansi</namePart>
<namePart type="family">Hettiarachchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tharindu</namePart>
<namePart type="family">Ranasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="family">Gaber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damith</namePart>
<namePart type="family">Premasiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fiona</namePart>
<namePart type="given">Anting</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lasitha</namePart>
<namePart type="family">Uyangodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) have improved performance across various natural language processing tasks. Despite these improvements, LLMs continue to face significant challenges, such as grammatical issues and code-switching to English, when applied to low-resource languages like Cantonese in Machine Translation (MT) scenarios. By addressing the unique linguistic and contextual challenges of Cantonese, we present a novel strategy to improve the understanding and translation capabilities of LLMs for Cantonese-to-Mandarin MT. Our strategy comprises three key components: (1) Syntax and Part-of-Speech (POS) fine-tuning, where we use the Universal Dependencies (UD) corpus to fine-tune LLM, focusing on the linguistic structures of Cantonese; (2) Specialized Cantonese to Mandarin sentence pairs, collected from diverse sources such as Cantonese grammar textbooks and manually translated sentences across various domains, to expose the model to a wide range of linguistic contexts; (3) Post-processing with additional LLMs, where we introduce additional LLMs to improve the initial translations, correcting Mandarin grammar and punctuation. Empirical evaluations on human-created test sets show that our proposed strategy improves translation performance and outperforms existing commercial translation models with at least 3 BLEU scores. Additionally, our strategy also benefits other LLMs and a reversed translation direction, demonstrating its generalization and effectiveness.</abstract>
<identifier type="citekey">dai-etal-2025-next</identifier>
<location>
<url>https://aclanthology.org/2025.loreslm-1.32/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>427</start>
<end>436</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Next-Level Cantonese-to-Mandarin Translation: Fine-Tuning and Post-Processing with LLMs
%A Dai, Yuqian
%A Chan, Chun Fai
%A Wong, Ying Ki
%A Pun, Tsz Ho
%Y Hettiarachchi, Hansi
%Y Ranasinghe, Tharindu
%Y Rayson, Paul
%Y Mitkov, Ruslan
%Y Gaber, Mohamed
%Y Premasiri, Damith
%Y Tan, Fiona Anting
%Y Uyangodage, Lasitha
%S Proceedings of the First Workshop on Language Models for Low-Resource Languages
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F dai-etal-2025-next
%X Large Language Models (LLMs) have improved performance across various natural language processing tasks. Despite these improvements, LLMs continue to face significant challenges, such as grammatical issues and code-switching to English, when applied to low-resource languages like Cantonese in Machine Translation (MT) scenarios. By addressing the unique linguistic and contextual challenges of Cantonese, we present a novel strategy to improve the understanding and translation capabilities of LLMs for Cantonese-to-Mandarin MT. Our strategy comprises three key components: (1) Syntax and Part-of-Speech (POS) fine-tuning, where we use the Universal Dependencies (UD) corpus to fine-tune LLM, focusing on the linguistic structures of Cantonese; (2) Specialized Cantonese to Mandarin sentence pairs, collected from diverse sources such as Cantonese grammar textbooks and manually translated sentences across various domains, to expose the model to a wide range of linguistic contexts; (3) Post-processing with additional LLMs, where we introduce additional LLMs to improve the initial translations, correcting Mandarin grammar and punctuation. Empirical evaluations on human-created test sets show that our proposed strategy improves translation performance and outperforms existing commercial translation models with at least 3 BLEU scores. Additionally, our strategy also benefits other LLMs and a reversed translation direction, demonstrating its generalization and effectiveness.
%U https://aclanthology.org/2025.loreslm-1.32/
%P 427-436
Markdown (Informal)
[Next-Level Cantonese-to-Mandarin Translation: Fine-Tuning and Post-Processing with LLMs](https://aclanthology.org/2025.loreslm-1.32/) (Dai et al., LoResLM 2025)
ACL
Yuqian Dai, Chun Fai Chan, Ying Ki Wong, and Tsz Ho Pun. 2025. [Next-Level Cantonese-to-Mandarin Translation: Fine-Tuning and Post-Processing with LLMs](https://aclanthology.org/2025.loreslm-1.32/). In *Proceedings of the First Workshop on Language Models for Low-Resource Languages*, pages 427–436, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.