@inproceedings{you-etal-2024-llm,
title = "{LLM}-Evolve: Evaluation for {LLM}{'}s Evolving Capability on Benchmarks",
author = "You, Jiaxuan and
Liu, Mingjie and
Prabhumoye, Shrimai and
Patwary, Mostofa and
Shoeybi, Mohammad and
Catanzaro, Bryan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.940",
pages = "16937--16942",
abstract = "The advancement of large language models (LLMs) has extended their use to dynamic and interactive real-world applications, where models engage continuously with their environment and potentially enhance their performance over time. Most existing LLM benchmarks evaluate LLMs on i.i.d. tasks, overlooking their ability to learn iteratively from past experiences. Our paper bridges this evaluation gap by proposing a novel framework, LLM-Evolve, which extends established benchmarks to sequential problem-solving settings. LLM-Evolve evaluates LLMs over multiple rounds, providing feedback after each round to build a demonstration memory that the models can query in future tasks. We applied LLM-Evolve to the MMLU, GSM8K, and AgentBench benchmarks, testing 8 state-of-the-art open-source and closed-source models. Results show that LLMs can achieve performance improvements of up to 17{\%} by learning from past interactions, with the quality of retrieval algorithms and feedback significantly influencing this capability. These insights advocate for more understanding and benchmarks for LLMs{'} performance in evolving interactive scenarios.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="you-etal-2024-llm">
<titleInfo>
<title>LLM-Evolve: Evaluation for LLM’s Evolving Capability on Benchmarks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiaxuan</namePart>
<namePart type="family">You</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingjie</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shrimai</namePart>
<namePart type="family">Prabhumoye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mostofa</namePart>
<namePart type="family">Patwary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Shoeybi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="family">Catanzaro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The advancement of large language models (LLMs) has extended their use to dynamic and interactive real-world applications, where models engage continuously with their environment and potentially enhance their performance over time. Most existing LLM benchmarks evaluate LLMs on i.i.d. tasks, overlooking their ability to learn iteratively from past experiences. Our paper bridges this evaluation gap by proposing a novel framework, LLM-Evolve, which extends established benchmarks to sequential problem-solving settings. LLM-Evolve evaluates LLMs over multiple rounds, providing feedback after each round to build a demonstration memory that the models can query in future tasks. We applied LLM-Evolve to the MMLU, GSM8K, and AgentBench benchmarks, testing 8 state-of-the-art open-source and closed-source models. Results show that LLMs can achieve performance improvements of up to 17% by learning from past interactions, with the quality of retrieval algorithms and feedback significantly influencing this capability. These insights advocate for more understanding and benchmarks for LLMs’ performance in evolving interactive scenarios.</abstract>
<identifier type="citekey">you-etal-2024-llm</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.940</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>16937</start>
<end>16942</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LLM-Evolve: Evaluation for LLM’s Evolving Capability on Benchmarks
%A You, Jiaxuan
%A Liu, Mingjie
%A Prabhumoye, Shrimai
%A Patwary, Mostofa
%A Shoeybi, Mohammad
%A Catanzaro, Bryan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F you-etal-2024-llm
%X The advancement of large language models (LLMs) has extended their use to dynamic and interactive real-world applications, where models engage continuously with their environment and potentially enhance their performance over time. Most existing LLM benchmarks evaluate LLMs on i.i.d. tasks, overlooking their ability to learn iteratively from past experiences. Our paper bridges this evaluation gap by proposing a novel framework, LLM-Evolve, which extends established benchmarks to sequential problem-solving settings. LLM-Evolve evaluates LLMs over multiple rounds, providing feedback after each round to build a demonstration memory that the models can query in future tasks. We applied LLM-Evolve to the MMLU, GSM8K, and AgentBench benchmarks, testing 8 state-of-the-art open-source and closed-source models. Results show that LLMs can achieve performance improvements of up to 17% by learning from past interactions, with the quality of retrieval algorithms and feedback significantly influencing this capability. These insights advocate for more understanding and benchmarks for LLMs’ performance in evolving interactive scenarios.
%U https://aclanthology.org/2024.emnlp-main.940
%P 16937-16942
Markdown (Informal)
[LLM-Evolve: Evaluation for LLM’s Evolving Capability on Benchmarks](https://aclanthology.org/2024.emnlp-main.940) (You et al., EMNLP 2024)
ACL
- Jiaxuan You, Mingjie Liu, Shrimai Prabhumoye, Mostofa Patwary, Mohammad Shoeybi, and Bryan Catanzaro. 2024. LLM-Evolve: Evaluation for LLM’s Evolving Capability on Benchmarks. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 16937–16942, Miami, Florida, USA. Association for Computational Linguistics.