@article{pang-etal-2024-rethinking,
title = "Rethinking the Exploitation of Monolingual Data for Low-Resource Neural Machine Translation",
author = "Pang, Jianhui and
Yang, Baosong and
Wong, Derek Fai and
Wan, Yu and
Liu, Dayiheng and
Chao, Lidia Sam and
Xie, Jun",
journal = "Computational Linguistics",
volume = "50",
number = "1",
month = mar,
year = "2024",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2024.cl-1.2",
doi = "10.1162/coli_a_00496",
pages = "25--47",
abstract = "The utilization of monolingual data has been shown to be a promising strategy for addressing low-resource machine translation problems. Previous studies have demonstrated the effectiveness of techniques such as back-translation and self-supervised objectives, including masked language modeling, causal language modeling, and denoise autoencoding, in improving the performance of machine translation models. However, the manner in which these methods contribute to the success of machine translation tasks and how they can be effectively combined remains an under-researched area. In this study, we carry out a systematic investigation of the effects of these techniques on linguistic properties through the use of probing tasks, including source language comprehension, bilingual word alignment, and translation fluency. We further evaluate the impact of pre-training, back-translation, and multi-task learning on bitexts of varying sizes. Our findings inform the design of more effective pipelines for leveraging monolingual data in extremely low-resource and low-resource machine translation tasks. Experiment results show consistent performance gains in seven translation directions, which provide further support for our conclusions and understanding of the role of monolingual data in machine translation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pang-etal-2024-rethinking">
<titleInfo>
<title>Rethinking the Exploitation of Monolingual Data for Low-Resource Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jianhui</namePart>
<namePart type="family">Pang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baosong</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="given">Fai</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Wan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayiheng</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lidia</namePart>
<namePart type="given">Sam</namePart>
<namePart type="family">Chao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>The utilization of monolingual data has been shown to be a promising strategy for addressing low-resource machine translation problems. Previous studies have demonstrated the effectiveness of techniques such as back-translation and self-supervised objectives, including masked language modeling, causal language modeling, and denoise autoencoding, in improving the performance of machine translation models. However, the manner in which these methods contribute to the success of machine translation tasks and how they can be effectively combined remains an under-researched area. In this study, we carry out a systematic investigation of the effects of these techniques on linguistic properties through the use of probing tasks, including source language comprehension, bilingual word alignment, and translation fluency. We further evaluate the impact of pre-training, back-translation, and multi-task learning on bitexts of varying sizes. Our findings inform the design of more effective pipelines for leveraging monolingual data in extremely low-resource and low-resource machine translation tasks. Experiment results show consistent performance gains in seven translation directions, which provide further support for our conclusions and understanding of the role of monolingual data in machine translation.</abstract>
<identifier type="citekey">pang-etal-2024-rethinking</identifier>
<identifier type="doi">10.1162/coli_a_00496</identifier>
<location>
<url>https://aclanthology.org/2024.cl-1.2</url>
</location>
<part>
<date>2024-03</date>
<detail type="volume"><number>50</number></detail>
<detail type="issue"><number>1</number></detail>
<extent unit="page">
<start>25</start>
<end>47</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Rethinking the Exploitation of Monolingual Data for Low-Resource Neural Machine Translation
%A Pang, Jianhui
%A Yang, Baosong
%A Wong, Derek Fai
%A Wan, Yu
%A Liu, Dayiheng
%A Chao, Lidia Sam
%A Xie, Jun
%J Computational Linguistics
%D 2024
%8 March
%V 50
%N 1
%I MIT Press
%C Cambridge, MA
%F pang-etal-2024-rethinking
%X The utilization of monolingual data has been shown to be a promising strategy for addressing low-resource machine translation problems. Previous studies have demonstrated the effectiveness of techniques such as back-translation and self-supervised objectives, including masked language modeling, causal language modeling, and denoise autoencoding, in improving the performance of machine translation models. However, the manner in which these methods contribute to the success of machine translation tasks and how they can be effectively combined remains an under-researched area. In this study, we carry out a systematic investigation of the effects of these techniques on linguistic properties through the use of probing tasks, including source language comprehension, bilingual word alignment, and translation fluency. We further evaluate the impact of pre-training, back-translation, and multi-task learning on bitexts of varying sizes. Our findings inform the design of more effective pipelines for leveraging monolingual data in extremely low-resource and low-resource machine translation tasks. Experiment results show consistent performance gains in seven translation directions, which provide further support for our conclusions and understanding of the role of monolingual data in machine translation.
%R 10.1162/coli_a_00496
%U https://aclanthology.org/2024.cl-1.2
%U https://doi.org/10.1162/coli_a_00496
%P 25-47
Markdown (Informal)
[Rethinking the Exploitation of Monolingual Data for Low-Resource Neural Machine Translation](https://aclanthology.org/2024.cl-1.2) (Pang et al., CL 2024)
ACL