@inproceedings{siska-etal-2024-examining,
title = "Examining the robustness of {LLM} evaluation to the distributional assumptions of benchmarks",
author = "Siska, Charlotte and
Marazopoulou, Katerina and
Ailem, Melissa and
Bono, James",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.560/",
doi = "10.18653/v1/2024.acl-long.560",
pages = "10406--10421",
abstract = "Benchmarks have emerged as the central approach for evaluating Large Language Models (LLMs). The research community often relies on a model`s average performance across the test prompts of a benchmark to evaluate the model`s performance. This is consistent with the assumption that the test prompts within a benchmark represent a random sample from some real-world distribution of interest. We note that this is generally not the case; instead, we hold that the distribution of interest varies according to the specific use case. Hence, we analyze the robustness of LLM benchmarks to their underlying distributional assumptions. We find that (1) the correlation in model performance across test prompts is non-random, (2) accounting for correlations across test prompts can change model rankings on major benchmarks, (3) explanatory factors for these correlations include semantic similarity and common LLM failure points."
}
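The abstract above argues that per-prompt performance correlations are non-random and that replacing the usual uniform average over test prompts with a use-case-specific weighting can reorder model rankings. The following is a minimal, hypothetical Python sketch of that kind of analysis; the score matrix, model names, prompt clusters, and weighting scheme are invented for illustration and are not the authors' data or method.

```python
# Hypothetical sketch, not the authors' code or data: illustrates (a) correlation of
# model performance across test prompts and (b) how a non-uniform prompt weighting
# (a different "distribution of interest") can change rankings vs. the uniform average.
import numpy as np

rng = np.random.default_rng(0)
model_names = ["model_a", "model_b", "model_c"]

# Invented per-prompt scores: 3 models x 100 prompts, with two prompt "clusters"
# on which the models' relative strengths differ.
easy = rng.normal([0.80, 0.75, 0.70], 0.05, size=(60, 3)).T   # cluster 1: 60 prompts
hard = rng.normal([0.40, 0.55, 0.60], 0.05, size=(40, 3)).T   # cluster 2: 40 prompts
scores = np.clip(np.hstack([easy, hard]), 0.0, 1.0)

# (a) Pairwise correlation of per-prompt performance between models.
print("model-model correlation over prompts:\n", np.corrcoef(scores).round(2))

# (b) Ranking under the uniform average vs. a weighting that emphasizes cluster 2,
# standing in for a use case dominated by the "hard" prompts.
weights = np.concatenate([np.full(60, 0.2 / 60), np.full(40, 0.8 / 40)])

def ranking(agg):
    return [model_names[i] for i in np.argsort(-agg)]

print("uniform ranking: ", ranking(scores.mean(axis=1)))
print("weighted ranking:", ranking(scores @ weights))
```

With these made-up numbers the uniform and weighted rankings disagree, which is only meant to show the mechanics of the comparison, not to reproduce the paper's results.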
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="siska-etal-2024-examining">
<titleInfo>
<title>Examining the robustness of LLM evaluation to the distributional assumptions of benchmarks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Charlotte</namePart>
<namePart type="family">Siska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katerina</namePart>
<namePart type="family">Marazopoulou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Melissa</namePart>
<namePart type="family">Ailem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Bono</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Benchmarks have emerged as the central approach for evaluating Large Language Models (LLMs). The research community often relies on a model's average performance across the test prompts of a benchmark to evaluate the model's performance. This is consistent with the assumption that the test prompts within a benchmark represent a random sample from some real-world distribution of interest. We note that this is generally not the case; instead, we hold that the distribution of interest varies according to the specific use case. Hence, we analyze the robustness of LLM benchmarks to their underlying distributional assumptions. We find that (1) the correlation in model performance across test prompts is non-random, (2) accounting for correlations across test prompts can change model rankings on major benchmarks, (3) explanatory factors for these correlations include semantic similarity and common LLM failure points.</abstract>
<identifier type="citekey">siska-etal-2024-examining</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.560</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.560/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>10406</start>
<end>10421</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Examining the robustness of LLM evaluation to the distributional assumptions of benchmarks
%A Siska, Charlotte
%A Marazopoulou, Katerina
%A Ailem, Melissa
%A Bono, James
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F siska-etal-2024-examining
%X Benchmarks have emerged as the central approach for evaluating Large Language Models (LLMs). The research community often relies on a model's average performance across the test prompts of a benchmark to evaluate the model's performance. This is consistent with the assumption that the test prompts within a benchmark represent a random sample from some real-world distribution of interest. We note that this is generally not the case; instead, we hold that the distribution of interest varies according to the specific use case. Hence, we analyze the robustness of LLM benchmarks to their underlying distributional assumptions. We find that (1) the correlation in model performance across test prompts is non-random, (2) accounting for correlations across test prompts can change model rankings on major benchmarks, (3) explanatory factors for these correlations include semantic similarity and common LLM failure points.
%R 10.18653/v1/2024.acl-long.560
%U https://aclanthology.org/2024.luhme-long.560/
%U https://doi.org/10.18653/v1/2024.acl-long.560
%P 10406-10421
Markdown (Informal)
[Examining the robustness of LLM evaluation to the distributional assumptions of benchmarks](https://aclanthology.org/2024.luhme-long.560/) (Siska et al., ACL 2024)
ACL
Charlotte Siska, Katerina Marazopoulou, Melissa Ailem, and James Bono. 2024. Examining the robustness of LLM evaluation to the distributional assumptions of benchmarks. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 10406–10421, Bangkok, Thailand. Association for Computational Linguistics.