@inproceedings{mikelenic-oliver-2024-using,
title = "Using a multilingual literary parallel corpus to train {NMT} systems",
author = "Mikeleni{\'c}, Bojana and
Oliver, Antoni",
editor = "Vanroy, Bram and
Lefer, Marie-Aude and
Macken, Lieve and
Ruffo, Paola",
booktitle = "Proceedings of the 1st Workshop on Creative-text Translation and Technology",
month = jun,
year = "2024",
address = "Sheffield, United Kingdom",
publisher = "European Association for Machine Translation",
url = "https://aclanthology.org/2024.ctt-1.1",
pages = "1--9",
abstract = "This article presents an application of a multilingual and multidirectional parallel corpus composed of literary texts in five Romance languages (Spanish, French, Italian, Portuguese, Romanian) and a Slavic language (Croatian), with a total of 142,000 segments and 15.7 million words. After combining it with very large freely available parallel corpora, this resource is used to train NMT systems tailored to literature. A total of five NMT systems have been trained: Spanish-French, Spanish-Italian, Spanish-Portuguese, Spanish-Romanian and Spanish-Croatian. The trained systems were evaluated using automatic metrics (BLEU, chrF2 and TER) and a comparison with a rule-based MT system (Apertium) and a neural system (Google Translate) is presented. As a main conclusion, we can highlight that the use of this literary corpus has been very productive, as the majority of the trained systems achieve comparable, and in some cases even better, values of the automatic quality metrics than a widely used commercial NMT system.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mikelenic-oliver-2024-using">
<titleInfo>
<title>Using a multilingual literary parallel corpus to train NMT systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bojana</namePart>
<namePart type="family">Mikelenić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antoni</namePart>
<namePart type="family">Oliver</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Creative-text Translation and Technology</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bram</namePart>
<namePart type="family">Vanroy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie-Aude</namePart>
<namePart type="family">Lefer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lieve</namePart>
<namePart type="family">Macken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Ruffo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Sheffield, United Kingdom</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This article presents an application of a multilingual and multidirectional parallel corpus composed of literary texts in five Romance languages (Spanish, French, Italian, Portuguese, Romanian) and a Slavic language (Croatian), with a total of 142,000 segments and 15.7 million words. After combining it with very large freely available parallel corpora, this resource is used to train NMT systems tailored to literature. A total of five NMT systems have been trained: Spanish-French, Spanish-Italian, Spanish-Portuguese, Spanish-Romanian and Spanish-Croatian. The trained systems were evaluated using automatic metrics (BLEU, chrF2 and TER) and a comparison with a rule-based MT system (Apertium) and a neural system (Google Translate) is presented. As a main conclusion, we can highlight that the use of this literary corpus has been very productive, as the majority of the trained systems achieve comparable, and in some cases even better, values of the automatic quality metrics than a widely used commercial NMT system.</abstract>
<identifier type="citekey">mikelenic-oliver-2024-using</identifier>
<location>
<url>https://aclanthology.org/2024.ctt-1.1</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>1</start>
<end>9</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Using a multilingual literary parallel corpus to train NMT systems
%A Mikelenić, Bojana
%A Oliver, Antoni
%Y Vanroy, Bram
%Y Lefer, Marie-Aude
%Y Macken, Lieve
%Y Ruffo, Paola
%S Proceedings of the 1st Workshop on Creative-text Translation and Technology
%D 2024
%8 June
%I European Association for Machine Translation
%C Sheffield, United Kingdom
%F mikelenic-oliver-2024-using
%X This article presents an application of a multilingual and multidirectional parallel corpus composed of literary texts in five Romance languages (Spanish, French, Italian, Portuguese, Romanian) and a Slavic language (Croatian), with a total of 142,000 segments and 15.7 million words. After combining it with very large freely available parallel corpora, this resource is used to train NMT systems tailored to literature. A total of five NMT systems have been trained: Spanish-French, Spanish-Italian, Spanish-Portuguese, Spanish-Romanian and Spanish-Croatian. The trained systems were evaluated using automatic metrics (BLEU, chrF2 and TER) and a comparison with a rule-based MT system (Apertium) and a neural system (Google Translate) is presented. As a main conclusion, we can highlight that the use of this literary corpus has been very productive, as the majority of the trained systems achieve comparable, and in some cases even better, values of the automatic quality metrics than a widely used commercial NMT system.
%U https://aclanthology.org/2024.ctt-1.1
%P 1-9
Markdown (Informal)
[Using a multilingual literary parallel corpus to train NMT systems](https://aclanthology.org/2024.ctt-1.1) (Mikelenić & Oliver, CTT-WS 2024)
ACL