@inproceedings{el-yagouby-etal-2025-evaluating,
title = "Evaluating {LLM}s Efficiency Using Successive Attempts on Binary-Outcome Tasks",
author = "El Yagouby, Mohamed Amine and
Zekroum, Mehdi and
Lahmadi, Abdelkader and
Ghogho, Mounir and
Festor, Olivier",
editor = "Bechet, Fr{\'e}d{\'e}ric and
Chifu, Adrian-Gabriel and
Pinel-Sauvagnat, Karen and
Favre, Benoit and
Maes, Eliot and
Nurbakova, Diana",
booktitle = "Actes de l'atelier {\'E}valuation des mod{\`e}les g{\'e}n{\'e}ratifs (LLM) et challenge 2025 (EvalLLM)",
    month = jun,
year = "2025",
address = "Marseille, France",
    publisher = "ATALA {\&} ARIA",
url = "https://aclanthology.org/2025.jeptalnrecital-evalllm.10/",
pages = "120--126",
abstract = "Evaluating Large Language Models (LLMs) using single-attempt metrics like Success Rate (SR) overlooks their capacity for iterative problem solving. In tasks with binary outcomes (success or failure), such as coding or planning, LLMs often benefit from multiple attempts. Existing multiattempt metrics like pass@k and success@k account for eventual success but ignore how efficiently it is achieved, making them more costly. We propose a new evaluation method with Successive Multiple Attempts, where a maximum number of retries is fixed, and introduce our Success Efficiency (SE) metric, which captures both success and efficiency in a single value by rewarding earlier successes and penalizing delays. Tested using the HumanEval dataset across six LLMs, SE captures how quickly an LLM solves tasks, which existing metrics do not offer. This work complements existing evaluation methods by measuring not only whether LLMs succeed but also how efficiently they do so."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="el-yagouby-etal-2025-evaluating">
<titleInfo>
<title>Evaluating LLMs Efficiency Using Successive Attempts on Binary-Outcome Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohamed</namePart>
<namePart type="given">Amine</namePart>
<namePart type="family">El Yagouby</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehdi</namePart>
<namePart type="family">Zekroum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdelkader</namePart>
<namePart type="family">Lahmadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mounir</namePart>
<namePart type="family">Ghogho</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olivier</namePart>
<namePart type="family">Festor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Actes de l’atelier Évaluation des modèles génératifs (LLM) et challenge 2025 (EvalLLM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Bechet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adrian-Gabriel</namePart>
<namePart type="family">Chifu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karen</namePart>
<namePart type="family">Pinel-Sauvagnat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benoit</namePart>
<namePart type="family">Favre</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliot</namePart>
<namePart type="family">Maes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Nurbakova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ATALA &amp; ARIA</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Evaluating Large Language Models (LLMs) using single-attempt metrics like Success Rate (SR) overlooks their capacity for iterative problem solving. In tasks with binary outcomes (success or failure), such as coding or planning, LLMs often benefit from multiple attempts. Existing multiattempt metrics like pass@k and success@k account for eventual success but ignore how efficiently it is achieved, making them more costly. We propose a new evaluation method with Successive Multiple Attempts, where a maximum number of retries is fixed, and introduce our Success Efficiency (SE) metric, which captures both success and efficiency in a single value by rewarding earlier successes and penalizing delays. Tested using the HumanEval dataset across six LLMs, SE captures how quickly an LLM solves tasks, which existing metrics do not offer. This work complements existing evaluation methods by measuring not only whether LLMs succeed but also how efficiently they do so.</abstract>
<identifier type="citekey">el-yagouby-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.jeptalnrecital-evalllm.10/</url>
</location>
<part>
<date>2025-06</date>
<extent unit="page">
<start>120</start>
<end>126</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating LLMs Efficiency Using Successive Attempts on Binary-Outcome Tasks
%A El Yagouby, Mohamed Amine
%A Zekroum, Mehdi
%A Lahmadi, Abdelkader
%A Ghogho, Mounir
%A Festor, Olivier
%Y Bechet, Frédéric
%Y Chifu, Adrian-Gabriel
%Y Pinel-Sauvagnat, Karen
%Y Favre, Benoit
%Y Maes, Eliot
%Y Nurbakova, Diana
%S Actes de l’atelier Évaluation des modèles génératifs (LLM) et challenge 2025 (EvalLLM)
%D 2025
%8 June
%I ATALA & ARIA
%C Marseille, France
%F el-yagouby-etal-2025-evaluating
%X Evaluating Large Language Models (LLMs) using single-attempt metrics like Success Rate (SR) overlooks their capacity for iterative problem solving. In tasks with binary outcomes (success or failure), such as coding or planning, LLMs often benefit from multiple attempts. Existing multiattempt metrics like pass@k and success@k account for eventual success but ignore how efficiently it is achieved, making them more costly. We propose a new evaluation method with Successive Multiple Attempts, where a maximum number of retries is fixed, and introduce our Success Efficiency (SE) metric, which captures both success and efficiency in a single value by rewarding earlier successes and penalizing delays. Tested using the HumanEval dataset across six LLMs, SE captures how quickly an LLM solves tasks, which existing metrics do not offer. This work complements existing evaluation methods by measuring not only whether LLMs succeed but also how efficiently they do so.
%U https://aclanthology.org/2025.jeptalnrecital-evalllm.10/
%P 120-126
Markdown (Informal)
[Evaluating LLMs Efficiency Using Successive Attempts on Binary-Outcome Tasks](https://aclanthology.org/2025.jeptalnrecital-evalllm.10/) (El Yagouby et al., JEP/TALN/RECITAL 2025)
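
The abstract above references the standard pass@k metric popularized with HumanEval and introduces a Success Efficiency (SE) score that rewards earlier successes under a fixed retry budget. As a point of reference, the Python sketch below pairs the well-known unbiased pass@k estimator with a hypothetical successive-attempt efficiency score; the SE formula itself is not given in the abstract, so `success_efficiency` here is an illustrative assumption (linear decay in the index of the first successful attempt), not the authors' definition.

```python
from math import comb
from typing import Optional


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator (Chen et al., 2021, HumanEval):
    probability that at least one of k samples, drawn without
    replacement from n generations of which c are correct, passes."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


def success_efficiency(first_success: Optional[int], max_attempts: int) -> float:
    """Hypothetical per-task efficiency score under successive attempts:
    1.0 for a first-try success, decreasing linearly as the successful
    attempt approaches the retry budget, and 0.0 if all attempts fail.
    NOTE: illustrative assumption only; this is not the SE formula
    defined in the paper."""
    if first_success is None:
        return 0.0
    return (max_attempts - first_success + 1) / max_attempts


# pass@10 with 200 samples per task, 43 of them passing the unit tests
print(round(pass_at_k(200, 43, 10), 4))
# success on attempt 2 out of a budget of 5 attempts -> 0.8
print(success_efficiency(2, 5))
# exhausting the budget without success -> 0.0
print(success_efficiency(None, 5))
```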