@inproceedings{yu-etal-2024-machine,
title = "Machine Translation Evaluation Benchmark for {W}u {C}hinese: Workflow and Analysis",
author = "Yu, Hongjian and
Shi, Yiming and
Zhou, Zherui and
Haberland, Christopher",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.47",
pages = "600--605",
abstract = "We introduce a FLORES+ dataset as an evaluation benchmark for modern Wu Chinese machine translation models and showcase its compatibility with existing Wu data. Wu Chinese is mutually unintelligible with other Sinitic languages such as Mandarin and Yue (Cantonese), but uses a set of Hanzi (Chinese characters) that profoundly overlaps with others. The population of Wu speakers is the second largest among languages in China, but the language has been suffering from significant drop in usage especially among the younger generations. We identify Wu Chinese as a textually low-resource language and address challenges for its machine translation models. Our contributions include: (1) an open-source, manually translated dataset, (2) full documentations on the process of dataset creation and validation experiments, (3) preliminary tools for Wu Chinese normalization and segmentation, and (4) benefits and limitations of our dataset, as well as implications to other low-resource languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2024-machine">
<titleInfo>
<title>Machine Translation Evaluation Benchmark for Wu Chinese: Workflow and Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hongjian</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yiming</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zherui</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Haberland</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce a FLORES+ dataset as an evaluation benchmark for modern Wu Chinese machine translation models and showcase its compatibility with existing Wu data. Wu Chinese is mutually unintelligible with other Sinitic languages such as Mandarin and Yue (Cantonese), but uses a set of Hanzi (Chinese characters) that profoundly overlaps with others. The population of Wu speakers is the second largest among languages in China, but the language has been suffering from significant drop in usage especially among the younger generations. We identify Wu Chinese as a textually low-resource language and address challenges for its machine translation models. Our contributions include: (1) an open-source, manually translated dataset, (2) full documentations on the process of dataset creation and validation experiments, (3) preliminary tools for Wu Chinese normalization and segmentation, and (4) benefits and limitations of our dataset, as well as implications to other low-resource languages.</abstract>
<identifier type="citekey">yu-etal-2024-machine</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.47</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>600</start>
<end>605</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Machine Translation Evaluation Benchmark for Wu Chinese: Workflow and Analysis
%A Yu, Hongjian
%A Shi, Yiming
%A Zhou, Zherui
%A Haberland, Christopher
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F yu-etal-2024-machine
%X We introduce a FLORES+ dataset as an evaluation benchmark for modern Wu Chinese machine translation models and showcase its compatibility with existing Wu data. Wu Chinese is mutually unintelligible with other Sinitic languages such as Mandarin and Yue (Cantonese), but uses a set of Hanzi (Chinese characters) that profoundly overlaps with others. The population of Wu speakers is the second largest among languages in China, but the language has been suffering from significant drop in usage especially among the younger generations. We identify Wu Chinese as a textually low-resource language and address challenges for its machine translation models. Our contributions include: (1) an open-source, manually translated dataset, (2) full documentations on the process of dataset creation and validation experiments, (3) preliminary tools for Wu Chinese normalization and segmentation, and (4) benefits and limitations of our dataset, as well as implications to other low-resource languages.
%U https://aclanthology.org/2024.wmt-1.47
%P 600-605
Markdown (Informal)
[Machine Translation Evaluation Benchmark for Wu Chinese: Workflow and Analysis](https://aclanthology.org/2024.wmt-1.47) (Yu et al., WMT 2024)
ACL