@inproceedings{choshen-etal-2024-navigating,
title = "Navigating the Modern Evaluation Landscape: Considerations in Benchmarks and Frameworks for Large Language Models ({LLM}s)",
author = "Choshen, Leshem and
Gera, Ariel and
Perlitz, Yotam and
Shmueli-Scheuer, Michal and
Stanovsky, Gabriel",
editor = "Klinger, Roman and
Okazaki, Naoaki and
Calzolari, Nicoletta and
Kan, Min-Yen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-tutorials.4/",
pages = "19--25",
abstract = "General-Purpose Language Models have changed the world of Natural Language Processing, if not the world itself. The evaluation of such versatile models, while supposedly similar to evaluation of generation models before them, in fact presents a host of new evaluation challenges and opportunities. In this Tutorial, we will start from the building blocks of evaluation. The tutorial welcomes people from diverse backgrounds and assumes little familiarity with metrics, datasets, prompts and benchmarks. It will lay the foundations and explain the basics and their importance, while touching on the major points and breakthroughs of the recent era of evaluation. It will also compare traditional evaluation methods {--} which are still widely used {--} to newly developed methods. We will contrast new to old approaches, from evaluating on many-task benchmarks rather than on dedicated datasets to efficiency constraints, and from testing stability and prompts on in-context learning to using the models themselves as evaluation metrics. Finally, the tutorial will cover practical issues, ranging from reviewing widely-used benchmarks and prompt banks to efficient evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="choshen-etal-2024-navigating">
<titleInfo>
<title>Navigating the Modern Evaluation Landscape: Considerations in Benchmarks and Frameworks for Large Language Models (LLMs)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leshem</namePart>
<namePart type="family">Choshen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ariel</namePart>
<namePart type="family">Gera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli-Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries</title>
</titleInfo>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Klinger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naozaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>General-Purpose Language Models have changed the world of Natural Language Processing, if not the world itself. The evaluation of such versatile models, while supposedly similar to evaluation of generation models before them, in fact presents a host of new evaluation challenges and opportunities. In this Tutorial, we will start from the building blocks of evaluation. The tutorial welcomes people from diverse backgrounds and assumes little familiarity with metrics, datasets, prompts and benchmarks. It will lay the foundations and explain the basics and their importance, while touching on the major points and breakthroughs of the recent era of evaluation. It will also compare traditional evaluation methods – which are still widely used – to newly developed methods. We will contrast new to old approaches, from evaluating on many-task benchmarks rather than on dedicated datasets to efficiency constraints, and from testing stability and prompts on in-context learning to using the models themselves as evaluation metrics. Finally, the tutorial will cover practical issues, ranging from reviewing widely-used benchmarks and prompt banks to efficient evaluation.</abstract>
<identifier type="citekey">choshen-etal-2024-navigating</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-tutorials.4/</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>19</start>
<end>25</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Navigating the Modern Evaluation Landscape: Considerations in Benchmarks and Frameworks for Large Language Models (LLMs)
%A Choshen, Leshem
%A Gera, Ariel
%A Perlitz, Yotam
%A Shmueli-Scheuer, Michal
%A Stanovsky, Gabriel
%Y Klinger, Roman
%Y Okazaki, Naoaki
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F choshen-etal-2024-navigating
%X General-Purpose Language Models have changed the world of Natural Language Processing, if not the world itself. The evaluation of such versatile models, while supposedly similar to evaluation of generation models before them, in fact presents a host of new evaluation challenges and opportunities. In this Tutorial, we will start from the building blocks of evaluation. The tutorial welcomes people from diverse backgrounds and assumes little familiarity with metrics, datasets, prompts and benchmarks. It will lay the foundations and explain the basics and their importance, while touching on the major points and breakthroughs of the recent era of evaluation. It will also compare traditional evaluation methods – which are still widely used – to newly developed methods. We will contrast new to old approaches, from evaluating on many-task benchmarks rather than on dedicated datasets to efficiency constraints, and from testing stability and prompts on in-context learning to using the models themselves as evaluation metrics. Finally, the tutorial will cover practical issues, ranging from reviewing widely-used benchmarks and prompt banks to efficient evaluation.
%U https://aclanthology.org/2024.lrec-tutorials.4/
%P 19-25
Markdown (Informal)
[Navigating the Modern Evaluation Landscape: Considerations in Benchmarks and Frameworks for Large Language Models (LLMs)](https://aclanthology.org/2024.lrec-tutorials.4/) (Choshen et al., LREC-COLING 2024)
ACL
Leshem Choshen, Ariel Gera, Yotam Perlitz, Michal Shmueli-Scheuer, and Gabriel Stanovsky. 2024. [Navigating the Modern Evaluation Landscape: Considerations in Benchmarks and Frameworks for Large Language Models (LLMs)](https://aclanthology.org/2024.lrec-tutorials.4/). In *Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries*, pages 19–25, Torino, Italia. ELRA and ICCL.