@inproceedings{moroni-etal-2024-towards,
title = "Towards a More Comprehensive Evaluation for {I}talian {LLM}s",
author = "Moroni, Luca and
Conia, Simone and
Martelli, Federico and
Navigli, Roberto",
editor = "Dell'Orletta, Felice and
Lenci, Alessandro and
Montemagni, Simonetta and
Sprugnoli, Rachele",
booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
month = dec,
year = "2024",
address = "Pisa, Italy",
publisher = "CEUR Workshop Proceedings",
url = "https://aclanthology.org/2024.clicit-1.67/",
pages = "584--599",
ISBN = "979-12-210-7060-6",
abstract = "Recent Large Language Models (LLMs) have shown impressive performance in addressing complex aspects of human language. These models have also demonstrated significant capabilities in processing and generating Italian text, achieving state-of-the-art results on current benchmarks for the Italian language. However, the number of such benchmarks is still insufficient. A case in point is the {\textquotedblleft}Open Ita LLM Leaderboard{\textquotedblright} which only supports three benchmarks, despite being one of the most popular evaluation suites for the evaluation of Italian-speaking LLMs. In this paper, we analyze the current pitfalls of existing evaluation suites and propose two ways to address this gap: i) a new suite of automatically-translated benchmarks, drawn from the most popular English benchmarks; and ii) the adaptation of existing manual datasets so that they can be used to complement the evaluation of Italian LLMs. We discuss the pros and cons of both approaches and release all our data to foster further research on the evaluation of Italian-speaking LLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="moroni-etal-2024-towards">
<titleInfo>
<title>Towards a More Comprehensive Evaluation for Italian LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luca</namePart>
<namePart type="family">Moroni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simone</namePart>
<namePart type="family">Conia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Federico</namePart>
<namePart type="family">Martelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>Recent Large Language Models (LLMs) have shown impressive performance in addressing complex aspects of human language. These models have also demonstrated significant capabilities in processing and generating Italian text, achieving state-of-the-art results on current benchmarks for the Italian language. However, the number of such benchmarks is still insufficient. A case in point is the “Open Ita LLM Leaderboard” which only supports three benchmarks, despite being one of the most popular evaluation suites for the evaluation of Italian-speaking LLMs. In this paper, we analyze the current pitfalls of existing evaluation suites and propose two ways to address this gap: i) a new suite of automatically-translated benchmarks, drawn from the most popular English benchmarks; and ii) the adaptation of existing manual datasets so that they can be used to complement the evaluation of Italian LLMs. We discuss the pros and cons of both approaches and release all our data to foster further research on the evaluation of Italian-speaking LLMs.</abstract>
<identifier type="citekey">moroni-etal-2024-towards</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.67/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>584</start>
<end>599</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards a More Comprehensive Evaluation for Italian LLMs
%A Moroni, Luca
%A Conia, Simone
%A Martelli, Federico
%A Navigli, Roberto
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F moroni-etal-2024-towards
%X Recent Large Language Models (LLMs) have shown impressive performance in addressing complex aspects of human language. These models have also demonstrated significant capabilities in processing and generating Italian text, achieving state-of-the-art results on current benchmarks for the Italian language. However, the number of such benchmarks is still insufficient. A case in point is the “Open Ita LLM Leaderboard” which only supports three benchmarks, despite being one of the most popular evaluation suites for the evaluation of Italian-speaking LLMs. In this paper, we analyze the current pitfalls of existing evaluation suites and propose two ways to address this gap: i) a new suite of automatically-translated benchmarks, drawn from the most popular English benchmarks; and ii) the adaptation of existing manual datasets so that they can be used to complement the evaluation of Italian LLMs. We discuss the pros and cons of both approaches and release all our data to foster further research on the evaluation of Italian-speaking LLMs.
%U https://aclanthology.org/2024.clicit-1.67/
%P 584-599
Markdown (Informal)
[Towards a More Comprehensive Evaluation for Italian LLMs](https://aclanthology.org/2024.clicit-1.67/) (Moroni et al., CLiC-it 2024)
ACL