@inproceedings{pensa-etal-2024-gita4calamita,
title = "{GITA}4{CALAMITA} - Evaluating the Physical Commonsense Understanding of {I}talian {LLM}s in a Multi-layered Approach: A {CALAMITA} Challenge",
author = "Pensa, Giulia and
Azurmendi, Ekhi and
Etxaniz, Julen and
Altuna, Bego{\~n}a and
Gonzalez-Dios, Itziar",
editor = "Dell'Orletta, Felice and
Lenci, Alessandro and
Montemagni, Simonetta and
Sprugnoli, Rachele",
booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
month = dec,
year = "2024",
address = "Pisa, Italy",
publisher = "CEUR Workshop Proceedings",
url = "https://aclanthology.org/2024.clicit-1.127/",
pages = "1153--1160",
ISBN = "979-12-210-7060-6",
abstract = "In the context of the CALAMITA Challenge, we investigate the physical commonsense reasoning capabilities of large language models (LLMs) and introduce a methodology to assess their low-level understanding of the physical world. To this end, we use a test set designed to evaluate physical commonsense reasoning in LLMs for the Italian language. We present a tiered dataset, named the Graded Italian Annotated dataset (GITA), which is written and annotated by a professional linguist. This dataset enables us to focus on three distinct levels of commonsense understanding. Our benchmark aims to evaluate three specific tasks: identifying plausible and implausible stories within our dataset, identifying the conflict that generates an implausible story, and identifying the physical states that make a story implausible. We perform these tasks using LLAMA3, and Gemma. Our findings reveal that, although the models may excel at high-level classification tasks, their reasoning is inconsistent and unverifiable, as they fail to capture intermediate evidence."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pensa-etal-2024-gita4calamita">
<titleInfo>
<title>GITA4CALAMITA - Evaluating the Physical Commonsense Understanding of Italian LLMs in a Multi-layered Approach: A CALAMITA Challenge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giulia</namePart>
<namePart type="family">Pensa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekhi</namePart>
<namePart type="family">Azurmendi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julen</namePart>
<namePart type="family">Etxaniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Begoña</namePart>
<namePart type="family">Altuna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itziar</namePart>
<namePart type="family">Gonzalez-Dios</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>In the context of the CALAMITA Challenge, we investigate the physical commonsense reasoning capabilities of large language models (LLMs) and introduce a methodology to assess their low-level understanding of the physical world. To this end, we use a test set designed to evaluate physical commonsense reasoning in LLMs for the Italian language. We present a tiered dataset, named the Graded Italian Annotated dataset (GITA), which is written and annotated by a professional linguist. This dataset enables us to focus on three distinct levels of commonsense understanding. Our benchmark aims to evaluate three specific tasks: identifying plausible and implausible stories within our dataset, identifying the conflict that generates an implausible story, and identifying the physical states that make a story implausible. We perform these tasks using LLAMA3, and Gemma. Our findings reveal that, although the models may excel at high-level classification tasks, their reasoning is inconsistent and unverifiable, as they fail to capture intermediate evidence.</abstract>
<identifier type="citekey">pensa-etal-2024-gita4calamita</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.127/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>1153</start>
<end>1160</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GITA4CALAMITA - Evaluating the Physical Commonsense Understanding of Italian LLMs in a Multi-layered Approach: A CALAMITA Challenge
%A Pensa, Giulia
%A Azurmendi, Ekhi
%A Etxaniz, Julen
%A Altuna, Begoña
%A Gonzalez-Dios, Itziar
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F pensa-etal-2024-gita4calamita
%X In the context of the CALAMITA Challenge, we investigate the physical commonsense reasoning capabilities of large language models (LLMs) and introduce a methodology to assess their low-level understanding of the physical world. To this end, we use a test set designed to evaluate physical commonsense reasoning in LLMs for the Italian language. We present a tiered dataset, named the Graded Italian Annotated dataset (GITA), which is written and annotated by a professional linguist. This dataset enables us to focus on three distinct levels of commonsense understanding. Our benchmark aims to evaluate three specific tasks: identifying plausible and implausible stories within our dataset, identifying the conflict that generates an implausible story, and identifying the physical states that make a story implausible. We perform these tasks using LLAMA3, and Gemma. Our findings reveal that, although the models may excel at high-level classification tasks, their reasoning is inconsistent and unverifiable, as they fail to capture intermediate evidence.
%U https://aclanthology.org/2024.clicit-1.127/
%P 1153-1160
Markdown (Informal)
[GITA4CALAMITA - Evaluating the Physical Commonsense Understanding of Italian LLMs in a Multi-layered Approach: A CALAMITA Challenge](https://aclanthology.org/2024.clicit-1.127/) (Pensa et al., CLiC-it 2024)
ACL