@inproceedings{arefyev-etal-2024-hplts,
title = "{HPLT}{'}s First Release of Data and Models",
author = {Arefyev, Nikolay and
Aulamo, Mikko and
Chen, Pinzhen and
De Gibert Bonet, Ona and
Haddow, Barry and
Helcl, Jind{\v{r}}ich and
Malik, Bhavitvya and
Ram{\'\i}rez-S{\'a}nchez, Gema and
Stepachev, Pavel and
Tiedemann, J{\"o}rg and
Vari{\v{s}}, Du{\v{s}}an and
Zaragoza-Bernabeu, Jaume},
editor = "Scarton, Carolina and
Prescott, Charlotte and
Bayliss, Chris and
Oakley, Chris and
Wright, Joanna and
Wrigley, Stuart and
Song, Xingyi and
Gow-Smith, Edward and
Forcada, Mikel and
Moniz, Helena",
booktitle = "Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2)",
month = jun,
year = "2024",
address = "Sheffield, UK",
publisher = "European Association for Machine Translation (EAMT)",
url = "https://aclanthology.org/2024.eamt-2.27",
pages = "53--54",
abstract = "The High Performance Language Technologies (HPLT) project is a 3-year EU-funded project that started in September 2022. It aims to deliver free, sustainable, and reusable datasets, models, and workflows at scale using high-performance computing. We describe the first results of the project. The data release includes monolingual data in 75 languages at 5.6T tokens and parallel data in 18 language pairs at 96M pairs, derived from 1.8 petabytes of web crawls. Building upon automated and transparent pipelines, the first machine translation (MT) models as well as large language models (LLMs) have been trained and released. Multiple data processing tools and pipelines have also been made public.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arefyev-etal-2024-hplts">
<titleInfo>
<title>HPLT’s First Release of Data and Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolay</namePart>
<namePart type="family">Arefyev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikko</namePart>
<namePart type="family">Aulamo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ona</namePart>
<namePart type="family">De Gibert Bonet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jindřich</namePart>
<namePart type="family">Helcl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bhavitvya</namePart>
<namePart type="family">Malik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gema</namePart>
<namePart type="family">Ramírez-Sánchez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pavel</namePart>
<namePart type="family">Stepachev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dušan</namePart>
<namePart type="family">Variš</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaume</namePart>
<namePart type="family">Zaragoza-Bernabeu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Carolina</namePart>
<namePart type="family">Scarton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Charlotte</namePart>
<namePart type="family">Prescott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Bayliss</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Oakley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joanna</namePart>
<namePart type="family">Wright</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stuart</namePart>
<namePart type="family">Wrigley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xingyi</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edward</namePart>
<namePart type="family">Gow-Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="family">Forcada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Moniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Association for Machine Translation (EAMT)</publisher>
<place>
<placeTerm type="text">Sheffield, UK</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The High Performance Language Technologies (HPLT) project is a 3-year EU-funded project that started in September 2022. It aims to deliver free, sustainable, and reusable datasets, models, and workflows at scale using high-performance computing. We describe the first results of the project. The data release includes monolingual data in 75 languages at 5.6T tokens and parallel data in 18 language pairs at 96M pairs, derived from 1.8 petabytes of web crawls. Building upon automated and transparent pipelines, the first machine translation (MT) models as well as large language models (LLMs) have been trained and released. Multiple data processing tools and pipelines have also been made public.</abstract>
<identifier type="citekey">arefyev-etal-2024-hplts</identifier>
<location>
<url>https://aclanthology.org/2024.eamt-2.27</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>53</start>
<end>54</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HPLT’s First Release of Data and Models
%A Arefyev, Nikolay
%A Aulamo, Mikko
%A Chen, Pinzhen
%A De Gibert Bonet, Ona
%A Haddow, Barry
%A Helcl, Jindřich
%A Malik, Bhavitvya
%A Ramírez-Sánchez, Gema
%A Stepachev, Pavel
%A Tiedemann, Jörg
%A Variš, Dušan
%A Zaragoza-Bernabeu, Jaume
%Y Scarton, Carolina
%Y Prescott, Charlotte
%Y Bayliss, Chris
%Y Oakley, Chris
%Y Wright, Joanna
%Y Wrigley, Stuart
%Y Song, Xingyi
%Y Gow-Smith, Edward
%Y Forcada, Mikel
%Y Moniz, Helena
%S Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2)
%D 2024
%8 June
%I European Association for Machine Translation (EAMT)
%C Sheffield, UK
%F arefyev-etal-2024-hplts
%X The High Performance Language Technologies (HPLT) project is a 3-year EU-funded project that started in September 2022. It aims to deliver free, sustainable, and reusable datasets, models, and workflows at scale using high-performance computing. We describe the first results of the project. The data release includes monolingual data in 75 languages at 5.6T tokens and parallel data in 18 language pairs at 96M pairs, derived from 1.8 petabytes of web crawls. Building upon automated and transparent pipelines, the first machine translation (MT) models as well as large language models (LLMs) have been trained and released. Multiple data processing tools and pipelines have also been made public.
%U https://aclanthology.org/2024.eamt-2.27
%P 53-54
Markdown (Informal)
[HPLT’s First Release of Data and Models](https://aclanthology.org/2024.eamt-2.27) (Arefyev et al., EAMT 2024)
ACL
- Nikolay Arefyev, Mikko Aulamo, Pinzhen Chen, Ona De Gibert Bonet, Barry Haddow, Jindřich Helcl, Bhavitvya Malik, Gema Ramírez-Sánchez, Pavel Stepachev, Jörg Tiedemann, Dušan Variš, and Jaume Zaragoza-Bernabeu. 2024. HPLT’s First Release of Data and Models. In Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 2), pages 53–54, Sheffield, UK. European Association for Machine Translation (EAMT).