@inproceedings{hu-etal-2024-sjtu,
title = "{SJTU} System Description for the {WMT}24 Low-Resource Languages of {S}pain Task",
author = "Hu, Tianxiang and
Sun, Haoxiang and
Gao, Ruize and
Tang, Jialong and
Zhang, Pei and
Yang, Baosong and
Wang, Rui",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.92",
pages = "943--948",
abstract = "We participate in the translation task on Spanish to Aragonese, Spanish to Aranese and Spanish to Asturian. Initially, we conduct preliminary experiments to assess the basic translation capabilities of various models and evaluate the impact of fine-tuning with different data types. We then choose to fine-tune the Qwen2-0.5B model using a forward synthesized pseudo-corpus from the Apertium translation system to replicate its fundamental performance. Building on this distillation model, we explore three optimization strategies across the three language directions: (1) Assembling the provided FLORES+ dev sets into a 5-shot format translation training dataset and performing few-shot fine-tuning to enhance model performance. (2) Utilizing the FLORES+ dev sets as training data and applying the Contrastive Preference Optimization (CPO) strategy for further refinement. (3) Retrieving the 20 most similar translation examples from the FLORES+ dev sets using the BM25 algorithm and performing 20-shot translations with the Claude 3.5-sonnet model. After evaluating these strategies, we select the best-performing approach for each language pair as our submission result.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hu-etal-2024-sjtu">
<titleInfo>
<title>SJTU System Description for the WMT24 Low-Resource Languages of Spain Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tianxiang</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoxiang</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruize</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jialong</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pei</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baosong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We participate in the translation task on Spanish to Aragonese, Spanish to Aranese and Spanish to Asturian. Initially, we conduct preliminary experiments to assess the basic translation capabilities of various models and evaluate the impact of fine-tuning with different data types. We then choose to fine-tune the Qwen2-0.5B model using a forward synthesized pseudo-corpus from the Apertium translation system to replicate its fundamental performance. Building on this distillation model, we explore three optimization strategies across the three language directions: (1) Assembling the provided FLORES+ dev sets into a 5-shot format translation training dataset and performing few-shot fine-tuning to enhance model performance. (2) Utilizing the FLORES+ dev sets as training data and applying the Contrastive Preference Optimization (CPO) strategy for further refinement. (3) Retrieving the 20 most similar translation examples from the FLORES+ dev sets using the BM25 algorithm and performing 20-shot translations with the Claude 3.5-sonnet model. After evaluating these strategies, we select the best-performing approach for each language pair as our submission result.</abstract>
<identifier type="citekey">hu-etal-2024-sjtu</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.92</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>943</start>
<end>948</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SJTU System Description for the WMT24 Low-Resource Languages of Spain Task
%A Hu, Tianxiang
%A Sun, Haoxiang
%A Gao, Ruize
%A Tang, Jialong
%A Zhang, Pei
%A Yang, Baosong
%A Wang, Rui
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F hu-etal-2024-sjtu
%X We participate in the translation task on Spanish to Aragonese, Spanish to Aranese and Spanish to Asturian. Initially, we conduct preliminary experiments to assess the basic translation capabilities of various models and evaluate the impact of fine-tuning with different data types. We then choose to fine-tune the Qwen2-0.5B model using a forward synthesized pseudo-corpus from the Apertium translation system to replicate its fundamental performance. Building on this distillation model, we explore three optimization strategies across the three language directions: (1) Assembling the provided FLORES+ dev sets into a 5-shot format translation training dataset and performing few-shot fine-tuning to enhance model performance. (2) Utilizing the FLORES+ dev sets as training data and applying the Contrastive Preference Optimization (CPO) strategy for further refinement. (3) Retrieving the 20 most similar translation examples from the FLORES+ dev sets using the BM25 algorithm and performing 20-shot translations with the Claude 3.5-sonnet model. After evaluating these strategies, we select the best-performing approach for each language pair as our submission result.
%U https://aclanthology.org/2024.wmt-1.92
%P 943-948
Markdown (Informal)
[SJTU System Description for the WMT24 Low-Resource Languages of Spain Task](https://aclanthology.org/2024.wmt-1.92) (Hu et al., WMT 2024)
ACL
- Tianxiang Hu, Haoxiang Sun, Ruize Gao, Jialong Tang, Pei Zhang, Baosong Yang, and Rui Wang. 2024. SJTU System Description for the WMT24 Low-Resource Languages of Spain Task. In Proceedings of the Ninth Conference on Machine Translation, pages 943–948, Miami, Florida, USA. Association for Computational Linguistics.