BibTeX
@inproceedings{orlando-etal-2024-minerva,
    title = "Minerva {LLM}s: The First Family of Large Language Models Trained from Scratch on {I}talian Data",
    author = "Orlando, Riccardo and
      Moroni, Luca and
      Huguet Cabot, Pere-Llu{\'i}s and
      Conia, Simone and
      Barba, Edoardo and
      Orlandini, Sergio and
      Fiameni, Giuseppe and
      Navigli, Roberto",
    editor = "Dell'Orletta, Felice and
      Lenci, Alessandro and
      Montemagni, Simonetta and
      Sprugnoli, Rachele",
    booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
    month = dec,
    year = "2024",
    address = "Pisa, Italy",
    publisher = "CEUR Workshop Proceedings",
    url = "https://aclanthology.org/2024.clicit-1.77/",
    pages = "707--719",
    ISBN = "979-12-210-7060-6",
    abstract = "The increasing popularity of Large Language Models (LLMs) has led to a surge in research on adapting existing models to different languages. However, the pretraining of non-English LLMs is still an underexplored area and there is no open-source endeavor that explores what is achievable with open Italian data. To address this issue, we present Minerva, the first family of LLMs trained from scratch on Italian data. The creation of Minerva is an opportunity to explore and investigate the pretraining of LLMs for the Italian language, outlining the challenges that arise when training LLMs with native Italian texts. Minerva demonstrates that an LLM for a specific language brings a number of practical benefits compared to the adaptation of an existing one, including deep control over the composition of the vocabulary and the training data. With this paper, we aim to provide a comprehensive overview of the design choices, results, and evaluation of our Minerva models, showing promising results on Italian benchmarks and downstream tasks. Most importantly, we share what we learned and the findings obtained during the development of Minerva, as we believe that our experience will be valuable for the academic and industrial communities interested in training non-English LLMs from scratch."
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="orlando-etal-2024-minerva">
  <titleInfo>
    <title>Minerva LLMs: The First Family of Large Language Models Trained from Scratch on Italian Data</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Riccardo</namePart>
    <namePart type="family">Orlando</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Luca</namePart>
    <namePart type="family">Moroni</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Pere-Lluís</namePart>
    <namePart type="family">Huguet Cabot</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Simone</namePart>
    <namePart type="family">Conia</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Edoardo</namePart>
    <namePart type="family">Barba</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Sergio</namePart>
    <namePart type="family">Orlandini</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Giuseppe</namePart>
    <namePart type="family">Fiameni</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Roberto</namePart>
    <namePart type="family">Navigli</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2024-12</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Felice</namePart>
      <namePart type="family">Dell’Orletta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alessandro</namePart>
      <namePart type="family">Lenci</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simonetta</namePart>
      <namePart type="family">Montemagni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rachele</namePart>
      <namePart type="family">Sprugnoli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>CEUR Workshop Proceedings</publisher>
      <place>
        <placeTerm type="text">Pisa, Italy</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
    <identifier type="isbn">979-12-210-7060-6</identifier>
  </relatedItem>
  <abstract>The increasing popularity of Large Language Models (LLMs) has led to a surge in research on adapting existing models to different languages. However, the pretraining of non-English LLMs is still an underexplored area and there is no open-source endeavor that explores what is achievable with open Italian data. To address this issue, we present Minerva, the first family of LLMs trained from scratch on Italian data. The creation of Minerva is an opportunity to explore and investigate the pretraining of LLMs for the Italian language, outlining the challenges that arise when training LLMs with native Italian texts. Minerva demonstrates that an LLM for a specific language brings a number of practical benefits compared to the adaptation of an existing one, including deep control over the composition of the vocabulary and the training data. With this paper, we aim to provide a comprehensive overview of the design choices, results, and evaluation of our Minerva models, showing promising results on Italian benchmarks and downstream tasks. Most importantly, we share what we learned and the findings obtained during the development of Minerva, as we believe that our experience will be valuable for the academic and industrial communities interested in training non-English LLMs from scratch.</abstract>
  <identifier type="citekey">orlando-etal-2024-minerva</identifier>
  <location>
    <url>https://aclanthology.org/2024.clicit-1.77/</url>
  </location>
  <part>
    <date>2024-12</date>
    <extent unit="page">
      <start>707</start>
      <end>719</end>
    </extent>
  </part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Minerva LLMs: The First Family of Large Language Models Trained from Scratch on Italian Data
%A Orlando, Riccardo
%A Moroni, Luca
%A Huguet Cabot, Pere-Lluís
%A Conia, Simone
%A Barba, Edoardo
%A Orlandini, Sergio
%A Fiameni, Giuseppe
%A Navigli, Roberto
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F orlando-etal-2024-minerva
%X The increasing popularity of Large Language Models (LLMs) has led to a surge in research on adapting existing models to different languages. However, the pretraining of non-English LLMs is still an underexplored area and there is no open-source endeavor that explores what is achievable with open Italian data. To address this issue, we present Minerva, the first family of LLMs trained from scratch on Italian data. The creation of Minerva is an opportunity to explore and investigate the pretraining of LLMs for the Italian language, outlining the challenges that arise when training LLMs with native Italian texts. Minerva demonstrates that an LLM for a specific language brings a number of practical benefits compared to the adaptation of an existing one, including deep control over the composition of the vocabulary and the training data. With this paper, we aim to provide a comprehensive overview of the design choices, results, and evaluation of our Minerva models, showing promising results on Italian benchmarks and downstream tasks. Most importantly, we share what we learned and the findings obtained during the development of Minerva, as we believe that our experience will be valuable for the academic and industrial communities interested in training non-English LLMs from scratch.
%U https://aclanthology.org/2024.clicit-1.77/
%P 707-719
Markdown (Informal)
[Minerva LLMs: The First Family of Large Language Models Trained from Scratch on Italian Data](https://aclanthology.org/2024.clicit-1.77/) (Orlando et al., CLiC-it 2024)
ACL
Riccardo Orlando, Luca Moroni, Pere-Lluís Huguet Cabot, Simone Conia, Edoardo Barba, Sergio Orlandini, Giuseppe Fiameni, and Roberto Navigli. 2024. Minerva LLMs: The First Family of Large Language Models Trained from Scratch on Italian Data. In Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024), pages 707–719, Pisa, Italy. CEUR Workshop Proceedings.