@inproceedings{tanigawa-etal-2024-analysis,
title = "Analysis on Unsupervised Acquisition Process of Bilingual Vocabulary through Iterative Back-Translation",
author = "Tanigawa, Takuma and
Akiba, Tomoyosi and
Tsukada, Hajime",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.80",
pages = "887--892",
abstract = "In this paper, we investigate how new bilingual vocabulary is acquired through Iterative Back-Translation (IBT), which is known as a data augmentation method for machine translation from monolingual data of both source and target languages. To reveal the acquisition process, we first identify the word translation pairs in test data that do not exist in a bilingual data but do only in two monolingual data, then observe how many pairs are successfully translated by the translation model trained through IBT. We experimented on it with domain adaptation settings on two language pairs. Our experimental evaluation showed that more than 60{\%} of the new bilingual vocabulary is successfully acquired through IBT along with the improvement in the translation quality in terms of BLEU. It also revealed that new bilingual vocabulary was gradually acquired by repeating IBT iterations. From the results, we present our hypothesis on the process of new bilingual vocabulary acquisition where the context of the words plays a critical role in the success of the acquisition.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tanigawa-etal-2024-analysis">
<titleInfo>
<title>Analysis on Unsupervised Acquisition Process of Bilingual Vocabulary through Iterative Back-Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Takuma</namePart>
<namePart type="family">Tanigawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tomoyosi</namePart>
<namePart type="family">Akiba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hajime</namePart>
<namePart type="family">Tsukada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper, we investigate how new bilingual vocabulary is acquired through Iterative Back-Translation (IBT), which is known as a data augmentation method for machine translation from monolingual data of both source and target languages. To reveal the acquisition process, we first identify the word translation pairs in test data that do not exist in a bilingual data but do only in two monolingual data, then observe how many pairs are successfully translated by the translation model trained through IBT. We experimented on it with domain adaptation settings on two language pairs. Our experimental evaluation showed that more than 60% of the new bilingual vocabulary is successfully acquired through IBT along with the improvement in the translation quality in terms of BLEU. It also revealed that new bilingual vocabulary was gradually acquired by repeating IBT iterations. From the results, we present our hypothesis on the process of new bilingual vocabulary acquisition where the context of the words plays a critical role in the success of the acquisition.</abstract>
<identifier type="citekey">tanigawa-etal-2024-analysis</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.80</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>887</start>
<end>892</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Analysis on Unsupervised Acquisition Process of Bilingual Vocabulary through Iterative Back-Translation
%A Tanigawa, Takuma
%A Akiba, Tomoyosi
%A Tsukada, Hajime
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F tanigawa-etal-2024-analysis
%X In this paper, we investigate how new bilingual vocabulary is acquired through Iterative Back-Translation (IBT), which is known as a data augmentation method for machine translation from monolingual data of both source and target languages. To reveal the acquisition process, we first identify the word translation pairs in test data that do not exist in a bilingual data but do only in two monolingual data, then observe how many pairs are successfully translated by the translation model trained through IBT. We experimented on it with domain adaptation settings on two language pairs. Our experimental evaluation showed that more than 60% of the new bilingual vocabulary is successfully acquired through IBT along with the improvement in the translation quality in terms of BLEU. It also revealed that new bilingual vocabulary was gradually acquired by repeating IBT iterations. From the results, we present our hypothesis on the process of new bilingual vocabulary acquisition where the context of the words plays a critical role in the success of the acquisition.
%U https://aclanthology.org/2024.lrec-main.80
%P 887-892
Markdown (Informal)
[Analysis on Unsupervised Acquisition Process of Bilingual Vocabulary through Iterative Back-Translation](https://aclanthology.org/2024.lrec-main.80) (Tanigawa et al., LREC-COLING 2024)
ACL