@inproceedings{puccetti-etal-2025-invalsi,
title = "The Invalsi Benchmarks: measuring the Linguistic and Mathematical understanding of Large Language Models in {I}talian",
author = "Puccetti, Giovanni and
Cassese, Maria and
Esuli, Andrea",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.453/",
pages = "6782--6797",
abstract = "While Italian is a high-resource language, there are few Italian-native benchmarks to evaluate generative Large Language Models (LLMs) in this language. This work presents three new benchmarks: Invalsi MATE to evaluate models performance on mathematical understanding in Italian, Invalsi ITA to evaluate language under standing in Italian and Olimpiadi MATE for more complex mathematical understanding. The first two benchmarks are based on the Invalsi tests, which are administered to students of age between 6 and 18 within the Italian school system and have been validated by several experts in teaching and pedagogy, the third one comes from the Italian highschool math Olympics. We evaluate 10 powerful language models on these benchmarks and we find that they are bound by 71{\%} accuracy on Invalsi MATE, achieved by Llama 3.1 70b instruct and by 88{\%} on Invalsi ITA. For both Invalsi MATE and Invalsi ITA we compare LLMs with the average performance of Italian students to show that Llama 3.1 is the only one to outperform them on Invalsi MATE while most models do so on Invalsi ITA, we then show that Olimpiadi MATE is more challenging than Invalsi MATE and the highest accuracy, achieved by Llama 3.1 405b instruct accuracy is 45{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="puccetti-etal-2025-invalsi">
<titleInfo>
<title>The Invalsi Benchmarks: measuring the Linguistic and Mathematical understanding of Large Language Models in Italian</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giovanni</namePart>
<namePart type="family">Puccetti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Cassese</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Esuli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While Italian is a high-resource language, there are few Italian-native benchmarks to evaluate generative Large Language Models (LLMs) in this language. This work presents three new benchmarks: Invalsi MATE to evaluate models performance on mathematical understanding in Italian, Invalsi ITA to evaluate language under standing in Italian and Olimpiadi MATE for more complex mathematical understanding. The first two benchmarks are based on the Invalsi tests, which are administered to students of age between 6 and 18 within the Italian school system and have been validated by several experts in teaching and pedagogy, the third one comes from the Italian highschool math Olympics. We evaluate 10 powerful language models on these benchmarks and we find that they are bound by 71% accuracy on Invalsi MATE, achieved by Llama 3.1 70b instruct and by 88% on Invalsi ITA. For both Invalsi MATE and Invalsi ITA we compare LLMs with the average performance of Italian students to show that Llama 3.1 is the only one to outperform them on Invalsi MATE while most models do so on Invalsi ITA, we then show that Olimpiadi MATE is more challenging than Invalsi MATE and the highest accuracy, achieved by Llama 3.1 405b instruct accuracy is 45%.</abstract>
<identifier type="citekey">puccetti-etal-2025-invalsi</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.453/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>6782</start>
<end>6797</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Invalsi Benchmarks: measuring the Linguistic and Mathematical understanding of Large Language Models in Italian
%A Puccetti, Giovanni
%A Cassese, Maria
%A Esuli, Andrea
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F puccetti-etal-2025-invalsi
%X While Italian is a high-resource language, there are few Italian-native benchmarks to evaluate generative Large Language Models (LLMs) in this language. This work presents three new benchmarks: Invalsi MATE to evaluate models performance on mathematical understanding in Italian, Invalsi ITA to evaluate language under standing in Italian and Olimpiadi MATE for more complex mathematical understanding. The first two benchmarks are based on the Invalsi tests, which are administered to students of age between 6 and 18 within the Italian school system and have been validated by several experts in teaching and pedagogy, the third one comes from the Italian highschool math Olympics. We evaluate 10 powerful language models on these benchmarks and we find that they are bound by 71% accuracy on Invalsi MATE, achieved by Llama 3.1 70b instruct and by 88% on Invalsi ITA. For both Invalsi MATE and Invalsi ITA we compare LLMs with the average performance of Italian students to show that Llama 3.1 is the only one to outperform them on Invalsi MATE while most models do so on Invalsi ITA, we then show that Olimpiadi MATE is more challenging than Invalsi MATE and the highest accuracy, achieved by Llama 3.1 405b instruct accuracy is 45%.
%U https://aclanthology.org/2025.coling-main.453/
%P 6782-6797
Markdown (Informal)
[The Invalsi Benchmarks: measuring the Linguistic and Mathematical understanding of Large Language Models in Italian](https://aclanthology.org/2025.coling-main.453/) (Puccetti et al., COLING 2025)
ACL