@inproceedings{kondo-etal-2021-sentence,
title = "Sentence Concatenation Approach to Data Augmentation for Neural Machine Translation",
author = "Kondo, Seiichiro and
Hotate, Kengo and
Hirasawa, Tosho and
Kaneko, Masahiro and
Komachi, Mamoru",
editor = "Durmus, Esin and
Gupta, Vivek and
Liu, Nelson and
Peng, Nanyun and
Su, Yu",
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Student Research Workshop",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-srw.18",
doi = "10.18653/v1/2021.naacl-srw.18",
pages = "143--149",
abstract = "Recently, neural machine translation is widely used for its high translation accuracy, but it is also known to show poor performance at long sentence translation. Besides, this tendency appears prominently for low resource languages. We assume that these problems are caused by long sentences being few in the train data. Therefore, we propose a data augmentation method for handling long sentences. Our method is simple; we only use given parallel corpora as train data and generate long sentences by concatenating two sentences. Based on our experiments, we confirm improvements in long sentence translation by proposed data augmentation despite the simplicity. Moreover, the proposed method improves translation quality more when combined with back-translation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kondo-etal-2021-sentence">
<titleInfo>
<title>Sentence Concatenation Approach to Data Augmentation for Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Seiichiro</namePart>
<namePart type="family">Kondo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kengo</namePart>
<namePart type="family">Hotate</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tosho</namePart>
<namePart type="family">Hirasawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masahiro</namePart>
<namePart type="family">Kaneko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mamoru</namePart>
<namePart type="family">Komachi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Esin</namePart>
<namePart type="family">Durmus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Gupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nelson</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nanyun</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, neural machine translation is widely used for its high translation accuracy, but it is also known to show poor performance at long sentence translation. Besides, this tendency appears prominently for low resource languages. We assume that these problems are caused by long sentences being few in the train data. Therefore, we propose a data augmentation method for handling long sentences. Our method is simple; we only use given parallel corpora as train data and generate long sentences by concatenating two sentences. Based on our experiments, we confirm improvements in long sentence translation by proposed data augmentation despite the simplicity. Moreover, the proposed method improves translation quality more when combined with back-translation.</abstract>
<identifier type="citekey">kondo-etal-2021-sentence</identifier>
<identifier type="doi">10.18653/v1/2021.naacl-srw.18</identifier>
<location>
<url>https://aclanthology.org/2021.naacl-srw.18</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>143</start>
<end>149</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Sentence Concatenation Approach to Data Augmentation for Neural Machine Translation
%A Kondo, Seiichiro
%A Hotate, Kengo
%A Hirasawa, Tosho
%A Kaneko, Masahiro
%A Komachi, Mamoru
%Y Durmus, Esin
%Y Gupta, Vivek
%Y Liu, Nelson
%Y Peng, Nanyun
%Y Su, Yu
%S Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Student Research Workshop
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F kondo-etal-2021-sentence
%X Recently, neural machine translation is widely used for its high translation accuracy, but it is also known to show poor performance at long sentence translation. Besides, this tendency appears prominently for low resource languages. We assume that these problems are caused by long sentences being few in the train data. Therefore, we propose a data augmentation method for handling long sentences. Our method is simple; we only use given parallel corpora as train data and generate long sentences by concatenating two sentences. Based on our experiments, we confirm improvements in long sentence translation by proposed data augmentation despite the simplicity. Moreover, the proposed method improves translation quality more when combined with back-translation.
%R 10.18653/v1/2021.naacl-srw.18
%U https://aclanthology.org/2021.naacl-srw.18
%U https://doi.org/10.18653/v1/2021.naacl-srw.18
%P 143-149
Markdown (Informal)
[Sentence Concatenation Approach to Data Augmentation for Neural Machine Translation](https://aclanthology.org/2021.naacl-srw.18) (Kondo et al., NAACL 2021)
ACL