@inproceedings{dmitrieva-konovalova-2023-creating,
title = "Creating a parallel {F}innish-{E}asy {F}innish dataset from news articles",
author = "Dmitrieva, Anna and
Konovalova, Aleksandra",
editor = {Espl{\`a}-Gomis, Miquel and
Forcada, Mikel L. and
Kuzman, Taja and
Ljube{\v{s}}i{\'c}, Nikola and
van Noord, Rik and
Ram{\'\i}rez-S{\'a}nchez, Gema and
Tiedemann, J{\"o}rg and
Toral, Antonio},
booktitle = "Proceedings of the 1st Workshop on Open Community-Driven Machine Translation",
month = jun,
year = "2023",
address = "Tampere, Finland",
publisher = "European Association for Machine Translation",
url = "https://aclanthology.org/2023.crowdmt-1.3",
pages = "21--26",
abstract = "Modern natural language processing tasks such as text simplification or summarization are typically formulated as monolingual machine translation tasks. This requires appropriate datasets to train, tune, and evaluate the models. This paper describes the creation of a parallel Finnish-Easy Finnish dataset from the Yle News archives. The dataset contains 1919 manually verified pairs of articles, each containing an article in Easy Finnish (selkosuomi) and a corresponding article from Standard Finnish news. Standard Finnish texts total 687555 words, and Easy Finnish texts have 106733 words. This new aligned resource was created automatically based on the Yle News archives from the Language Bank of Finland (Kielipankki) and manually checked by a human expert. The dataset is available for download from Kielipankki. This resource will allow for more effective Easy Language research and for creating applications for automatic simplification and/or summarization of Finnish texts.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dmitrieva-konovalova-2023-creating">
<titleInfo>
<title>Creating a parallel Finnish-Easy Finnish dataset from news articles</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Dmitrieva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleksandra</namePart>
<namePart type="family">Konovalova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Open Community-Driven Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Miquel</namePart>
<namePart type="family">Esplà-Gomis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taja</namePart>
<namePart type="family">Kuzman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rik</namePart>
<namePart type="family">van Noord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gema</namePart>
<namePart type="family">Ramírez-Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation</publisher>
<place>
<placeTerm type="text">Tampere, Finland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Modern natural language processing tasks such as text simplification or summarization are typically formulated as monolingual machine translation tasks. This requires appropriate datasets to train, tune, and evaluate the models. This paper describes the creation of a parallel Finnish-Easy Finnish dataset from the Yle News archives. The dataset contains 1919 manually verified pairs of articles, each containing an article in Easy Finnish (selkosuomi) and a corresponding article from Standard Finnish news. Standard Finnish texts total 687555 words, and Easy Finnish texts have 106733 words. This new aligned resource was created automatically based on the Yle News archives from the Language Bank of Finland (Kielipankki) and manually checked by a human expert. The dataset is available for download from Kielipankki. This resource will allow for more effective Easy Language research and for creating applications for automatic simplification and/or summarization of Finnish texts.</abstract>
<identifier type="citekey">dmitrieva-konovalova-2023-creating</identifier>
<location>
<url>https://aclanthology.org/2023.crowdmt-1.3</url>
</location>
<part>
<date>2023-06</date>
<extent unit="page">
<start>21</start>
<end>26</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Creating a parallel Finnish-Easy Finnish dataset from news articles
%A Dmitrieva, Anna
%A Konovalova, Aleksandra
%Y Esplà-Gomis, Miquel
%Y Forcada, Mikel L.
%Y Kuzman, Taja
%Y Ljubešić, Nikola
%Y van Noord, Rik
%Y Ramírez-Sánchez, Gema
%Y Tiedemann, Jörg
%Y Toral, Antonio
%S Proceedings of the 1st Workshop on Open Community-Driven Machine Translation
%D 2023
%8 June
%I European Association for Machine Translation
%C Tampere, Finland
%F dmitrieva-konovalova-2023-creating
%X Modern natural language processing tasks such as text simplification or summarization are typically formulated as monolingual machine translation tasks. This requires appropriate datasets to train, tune, and evaluate the models. This paper describes the creation of a parallel Finnish-Easy Finnish dataset from the Yle News archives. The dataset contains 1919 manually verified pairs of articles, each containing an article in Easy Finnish (selkosuomi) and a corresponding article from Standard Finnish news. Standard Finnish texts total 687555 words, and Easy Finnish texts have 106733 words. This new aligned resource was created automatically based on the Yle News archives from the Language Bank of Finland (Kielipankki) and manually checked by a human expert. The dataset is available for download from Kielipankki. This resource will allow for more effective Easy Language research and for creating applications for automatic simplification and/or summarization of Finnish texts.
%U https://aclanthology.org/2023.crowdmt-1.3
%P 21-26
Markdown (Informal)
[Creating a parallel Finnish-Easy Finnish dataset from news articles](https://aclanthology.org/2023.crowdmt-1.3) (Dmitrieva & Konovalova, CrowdMT 2023)
ACL