@inproceedings{mercorio-etal-2024-beep,
title = "{BEEP} - {BE}st {D}riv{E}r's License Performer: A {CALAMITA} Challenge",
author = "Mercorio, Fabio and
Potert{\`i}, Daniele and
Serino, Antonio and
Seveso, Andrea",
editor = "Dell'Orletta, Felice and
Lenci, Alessandro and
Montemagni, Simonetta and
Sprugnoli, Rachele",
booktitle = "Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)",
month = dec,
year = "2024",
address = "Pisa, Italy",
publisher = "CEUR Workshop Proceedings",
url = "https://aclanthology.org/2024.clicit-1.135/",
pages = "1222--1227",
ISBN = "979-12-210-7060-6",
abstract = "We present BEEP (BEst DrivEr's License Performer), a benchmark challenge to evaluate large language models in the context of a simulated Italian driver's license exam. This challenge tests the models' ability to understand and apply traffic laws, road safety regulations, and vehicle-related knowledge through a series of true/false questions. The dataset is derived from official ministerial materials used in the Italian licensing process, specifically targeting Category B licenses. We evaluate models such as LLaMA and Mixtral across multiple categories. In addition, we simulate a driving license test to assess the models' real-world applicability, where the pass rate is determined based on the number of errors allowed. While scaling up model size improved performance, even larger models struggled to pass the exam consistently. The challenge demonstrates the capabilities and limitations of LLMs in handling real-world, high-stakes scenarios, providing insights into their practical use and areas for further improvement."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mercorio-etal-2024-beep">
<titleInfo>
<title>BEEP - BEst DrivEr’s License Performer: A CALAMITA Challenge</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Mercorio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniele</namePart>
<namePart type="family">Potertì</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Serino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Seveso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felice</namePart>
<namePart type="family">Dell’Orletta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simonetta</namePart>
<namePart type="family">Montemagni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachele</namePart>
<namePart type="family">Sprugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>CEUR Workshop Proceedings</publisher>
<place>
<placeTerm type="text">Pisa, Italy</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-12-210-7060-6</identifier>
</relatedItem>
<abstract>We present BEEP (BEst DrivEr’s License Performer), a benchmark challenge to evaluate large language models in the context of a simulated Italian driver’s license exam. This challenge tests the models’ ability to understand and apply traffic laws, road safety regulations, and vehicle-related knowledge through a series of true/false questions. The dataset is derived from official ministerial materials used in the Italian licensing process, specifically targeting Category B licenses. We evaluate models such as LLaMA and Mixtral across multiple categories. In addition, we simulate a driving license test to assess the models’ real-world applicability, where the pass rate is determined based on the number of errors allowed. While scaling up model size improved performance, even larger models struggled to pass the exam consistently. The challenge demonstrates the capabilities and limitations of LLMs in handling real-world, high-stakes scenarios, providing insights into their practical use and areas for further improvement.</abstract>
<identifier type="citekey">mercorio-etal-2024-beep</identifier>
<location>
<url>https://aclanthology.org/2024.clicit-1.135/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>1222</start>
<end>1227</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BEEP - BEst DrivEr’s License Performer: A CALAMITA Challenge
%A Mercorio, Fabio
%A Potertì, Daniele
%A Serino, Antonio
%A Seveso, Andrea
%Y Dell’Orletta, Felice
%Y Lenci, Alessandro
%Y Montemagni, Simonetta
%Y Sprugnoli, Rachele
%S Proceedings of the 10th Italian Conference on Computational Linguistics (CLiC-it 2024)
%D 2024
%8 December
%I CEUR Workshop Proceedings
%C Pisa, Italy
%@ 979-12-210-7060-6
%F mercorio-etal-2024-beep
%X We present BEEP (BEst DrivEr’s License Performer), a benchmark challenge to evaluate large language models in the context of a simulated Italian driver’s license exam. This challenge tests the models’ ability to understand and apply traffic laws, road safety regulations, and vehicle-related knowledge through a series of true/false questions. The dataset is derived from official ministerial materials used in the Italian licensing process, specifically targeting Category B licenses. We evaluate models such as LLaMA and Mixtral across multiple categories. In addition, we simulate a driving license test to assess the models’ real-world applicability, where the pass rate is determined based on the number of errors allowed. While scaling up model size improved performance, even larger models struggled to pass the exam consistently. The challenge demonstrates the capabilities and limitations of LLMs in handling real-world, high-stakes scenarios, providing insights into their practical use and areas for further improvement.
%U https://aclanthology.org/2024.clicit-1.135/
%P 1222-1227
Markdown (Informal)
[BEEP - BEst DrivEr’s License Performer: A CALAMITA Challenge](https://aclanthology.org/2024.clicit-1.135/) (Mercorio et al., CLiC-it 2024)
ACL