@inproceedings{musacchio-etal-2024-study,
title = "A Study on the Soundness of Closed-ended Evaluation of Large Language Models Adapted to the {I}talian Language",
author = "Musacchio, Elio and
Siciliani, Lucia and
Basile, Pierpaolo and
Michielon, Edoardo and
Pasqualini, Marco and
Uboldi, Asia Beatrice and
Semeraro, Giovanni",
editor = "Dell'Orletta, Felice and
Lenci, Alessandro and
Montemagni, Simonetta and
Sprugnoli, Rachele",
booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
month = dec,
year = "2024",
address = "Pisa, Italy",
publisher = "CEUR Workshop Proceedings",
url = "https://aclanthology.org/2024.clicit-1.68/",
pages = "600--611",
ISBN = "979-12-210-7060-6",
abstract = "With the rising interest in Large Language Models, deep architectures capable of solving a wide range of Natural LanguageGeneration tasks, an increasing number of open weights architectures have been developed and released online. In contrastwith older architectures, which were aimed at solving specific linguistic assignments, Large Language Models have shownoutstanding capabilities in solving several tasks at once, raising the question of whether they can truly comprehend naturallanguage. Nevertheless, evaluating this kind of capability is far from easy. One of the proposed solutions so far is usingbenchmarks that combine various types of tasks. This approach is based on the premise that achieving good performance ineach of these individual tasks can imply having developed a model capable of understanding language. However, while thisassumption is not incorrect, it is evident that it is not sufficient, and the evaluation of Large Language Models still remains anopen challenge. In this paper, we conduct a study aimed at highlighting the potential and limitations of current datasets andhow a new evaluation setting applied to language-adapted Large Language Models may provide more insight than traditionalapproaches."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="musacchio-etal-2024-study">
<titleInfo>
<title>A Study on the Soundness of Closed-ended Evaluation of Large Language Models Adapted to the Italian Language</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elio</namePart>
<namePart type="family">Musacchio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Siciliani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierpaolo</namePart>
<namePart type="family">Basile</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edoardo</namePart>
<namePart type="family">Michielon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Pasqualini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asia</namePart>
<namePart type="given">Beatrice</namePart>
<namePart type="family">Uboldi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giovanni</namePart>
<namePart type="family">Semeraro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>With the rising interest in Large Language Models, deep architectures capable of solving a wide range of Natural Language Generation tasks, an increasing number of open weights architectures have been developed and released online. In contrast with older architectures, which were aimed at solving specific linguistic assignments, Large Language Models have shown outstanding capabilities in solving several tasks at once, raising the question of whether they can truly comprehend natural language. Nevertheless, evaluating this kind of capability is far from easy. One of the proposed solutions so far is using benchmarks that combine various types of tasks. This approach is based on the premise that achieving good performance in each of these individual tasks can imply having developed a model capable of understanding language. However, while this assumption is not incorrect, it is evident that it is not sufficient, and the evaluation of Large Language Models still remains an open challenge. In this paper, we conduct a study aimed at highlighting the potential and limitations of current datasets and how a new evaluation setting applied to language-adapted Large Language Models may provide more insight than traditional approaches.</abstract>
<identifier type="citekey">musacchio-etal-2024-study</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.68/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>600</start>
<end>611</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Study on the Soundness of Closed-ended Evaluation of Large Language Models Adapted to the Italian Language
%A Musacchio, Elio
%A Siciliani, Lucia
%A Basile, Pierpaolo
%A Michielon, Edoardo
%A Pasqualini, Marco
%A Uboldi, Asia Beatrice
%A Semeraro, Giovanni
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F musacchio-etal-2024-study
%X With the rising interest in Large Language Models, deep architectures capable of solving a wide range of Natural Language Generation tasks, an increasing number of open weights architectures have been developed and released online. In contrast with older architectures, which were aimed at solving specific linguistic assignments, Large Language Models have shown outstanding capabilities in solving several tasks at once, raising the question of whether they can truly comprehend natural language. Nevertheless, evaluating this kind of capability is far from easy. One of the proposed solutions so far is using benchmarks that combine various types of tasks. This approach is based on the premise that achieving good performance in each of these individual tasks can imply having developed a model capable of understanding language. However, while this assumption is not incorrect, it is evident that it is not sufficient, and the evaluation of Large Language Models still remains an open challenge. In this paper, we conduct a study aimed at highlighting the potential and limitations of current datasets and how a new evaluation setting applied to language-adapted Large Language Models may provide more insight than traditional approaches.
%U https://aclanthology.org/2024.clicit-1.68/
%P 600-611
Markdown (Informal)
[A Study on the Soundness of Closed-ended Evaluation of Large Language Models Adapted to the Italian Language](https://aclanthology.org/2024.clicit-1.68/) (Musacchio et al., CLiC-it 2024)
ACL
Elio Musacchio, Lucia Siciliani, Pierpaolo Basile, Edoardo Michielon, Marco Pasqualini, Asia Beatrice Uboldi, and Giovanni Semeraro. 2024. A Study on the Soundness of Closed-ended Evaluation of Large Language Models Adapted to the Italian Language. In Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024), pages 600–611, Pisa, Italy. CEUR Workshop Proceedings.