% ACL Anthology BibTeX record for paper 2025.acl-srw.33
% (ACL 2025, Volume 4: Student Research Workshop).
@inproceedings{zhao-etal-2025-neural,
    title     = {Neural Machine Translation for Agglutinative Languages via Data Rejuvenation},
    author    = {Zhao, Chen and
                 Ji, Yatu and
                 Qing-Dao-Er-Ji, Ren and
                 Wu, Nier and
                 Shi, Lei and
                 Liu, Fu and
                 Jia, Yepai},
    editor    = {Zhao, Jin and
                 Wang, Mingyang and
                 Liu, Zhu},
    booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)},
    month     = jul,
    year      = {2025},
    address   = {Vienna, Austria},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2025.acl-srw.33/},
    doi       = {10.18653/v1/2025.acl-srw.33},
    pages     = {508--516},
    isbn      = {979-8-89176-254-1},
    abstract  = {In recent years, advances in Neural Machine Translation (NMT) heavily rely on large-scale parallel corpora. Within the context of China{'}s Belt and Road Initiative, there is increasing demand for improving translation quality from agglutinative languages (e.g., Mongolian, Arabic) to Chinese. However, the translation scenarios for agglutinative languages (which form words by concatenating morphemes with clear boundaries) face significant challenges including data sparsity, quality imbalance, and inactive sample proliferation due to their morphological complexity and syntactic flexibility. This study presents a systematic analysis of data distribution characteristics in agglutinative languages and proposes a dual-module framework combining fine-grained inactive sample identification with target-side rejuvenation. Our framework first establishes a multi-dimensional evaluation system to accurately identify samples exhibiting low-frequency morphological interference or long-range word order mismatches. Subsequently, the target-side rejuvenation mechanism generates diversified noise-resistant translations through iterative optimization of sample contribution weights. Experimental results on four low-resource agglutinative language tasks demonstrate significant performance improvements (BLEU +2.1{--}3.4) across mainstream NMT architectures. Architecture-agnostic validation further confirms the framework{'}s generalizability.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey zhao-etal-2025-neural.
     Paper-level fields sit directly under <mods>; the host proceedings
     (title, editors, publisher, place, ISBN) are nested in <relatedItem type="host">. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhao-etal-2025-neural">
    <titleInfo>
      <title>Neural Machine Translation for Agglutinative Languages via Data Rejuvenation</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Chen</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yatu</namePart>
      <namePart type="family">Ji</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ren</namePart>
      <namePart type="family">Qing-Dao-Er-Ji</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nier</namePart>
      <namePart type="family">Wu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lei</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fu</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yepai</namePart>
      <namePart type="family">Jia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the proceedings volume and its editors -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Jin</namePart>
        <namePart type="family">Zhao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mingyang</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zhu</namePart>
        <namePart type="family">Liu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-254-1</identifier>
    </relatedItem>
    <abstract>In recent years, advances in Neural Machine Translation (NMT) heavily rely on large-scale parallel corpora. Within the context of China’s Belt and Road Initiative, there is increasing demand for improving translation quality from agglutinative languages (e.g., Mongolian, Arabic) to Chinese. However, the translation scenarios for agglutinative languages (which form words by concatenating morphemes with clear boundaries) face significant challenges including data sparsity, quality imbalance, and inactive sample proliferation due to their morphological complexity and syntactic flexibility. This study presents a systematic analysis of data distribution characteristics in agglutinative languages and proposes a dual-module framework combining fine-grained inactive sample identification with target-side rejuvenation. Our framework first establishes a multi-dimensional evaluation system to accurately identify samples exhibiting low-frequency morphological interference or long-range word order mismatches. Subsequently, the target-side rejuvenation mechanism generates diversified noise-resistant translations through iterative optimization of sample contribution weights. Experimental results on four low-resource agglutinative language tasks demonstrate significant performance improvements (BLEU +2.1–3.4) across mainstream NMT architectures. Architecture-agnostic validation further confirms the framework’s generalizability.</abstract>
    <identifier type="citekey">zhao-etal-2025-neural</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-srw.33</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-srw.33/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>508</start>
        <end>516</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Neural Machine Translation for Agglutinative Languages via Data Rejuvenation
%A Zhao, Chen
%A Ji, Yatu
%A Qing-Dao-Er-Ji, Ren
%A Wu, Nier
%A Shi, Lei
%A Liu, Fu
%A Jia, Yepai
%Y Zhao, Jin
%Y Wang, Mingyang
%Y Liu, Zhu
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-254-1
%F zhao-etal-2025-neural
%X In recent years, advances in Neural Machine Translation (NMT) heavily rely on large-scale parallel corpora. Within the context of China’s Belt and Road Initiative, there is increasing demand for improving translation quality from agglutinative languages (e.g., Mongolian, Arabic) to Chinese. However, the translation scenarios for agglutinative languages (which form words by concatenating morphemes with clear boundaries) face significant challenges including data sparsity, quality imbalance, and inactive sample proliferation due to their morphological complexity and syntactic flexibility. This study presents a systematic analysis of data distribution characteristics in agglutinative languages and proposes a dual-module framework combining fine-grained inactive sample identification with target-side rejuvenation. Our framework first establishes a multi-dimensional evaluation system to accurately identify samples exhibiting low-frequency morphological interference or long-range word order mismatches. Subsequently, the target-side rejuvenation mechanism generates diversified noise-resistant translations through iterative optimization of sample contribution weights. Experimental results on four low-resource agglutinative language tasks demonstrate significant performance improvements (BLEU +2.1–3.4) across mainstream NMT architectures. Architecture-agnostic validation further confirms the framework’s generalizability.
%R 10.18653/v1/2025.acl-srw.33
%U https://aclanthology.org/2025.acl-srw.33/
%U https://doi.org/10.18653/v1/2025.acl-srw.33
%P 508-516
Markdown (Informal)
[Neural Machine Translation for Agglutinative Languages via Data Rejuvenation](https://aclanthology.org/2025.acl-srw.33/) (Zhao et al., ACL 2025)
ACL