@inproceedings{klimaszewski-etal-2025-avenibench,
title = "{A}veni{B}ench: Accessible and Versatile Evaluation of Finance Intelligence",
author = "Klimaszewski, Mateusz and
Chen, Pinzhen and
Guillou, Liane and
Papaioannou, Ioannis and
Haddow, Barry and
Birch, Alexandra",
editor = "Chen, Chung-Chi and
Moreno-Sandoval, Antonio and
Huang, Jimin and
Xie, Qianqian and
Ananiadou, Sophia and
Chen, Hsin-Hsi",
booktitle = "Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.finnlp-1.10/",
pages = "111--117",
abstract = "Over the last few years, there has been great interest in applying large language models (LLMs) to problems in the finance industry, and the field needs a robust LLM benchmark to support this work. Current financial LLM benchmarks contain simple tasks which are not representative of real use cases and have test sets with licences that do not allow commercial use. In response, we release AveniBench, a permissively licensed benchmark that tests a group of six key finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, summarisation and dialogue. We refactor the test sets to ensure that metrics are comparable, providing a unified framework. Furthermore, AveniBench introduces two task difficulty modes, easy and hard, enabling scalable evaluation based on real-world deployment needs. We use our benchmark to evaluate a diverse set of 20 widely used LLMs, from small open-weight models to proprietary systems like GPT-4. This evaluation initiates our public leaderboard, providing valuable insights for future academic research and commercial development."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="klimaszewski-etal-2025-avenibench">
<titleInfo>
<title>AveniBench: Accessible and Versatile Evaluation of Finance Intelligence</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mateusz</namePart>
<namePart type="family">Klimaszewski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liane</namePart>
<namePart type="family">Guillou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioannis</namePart>
<namePart type="family">Papaioannou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Birch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chung-Chi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Moreno-Sandoval</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jimin</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qianqian</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sophia</namePart>
<namePart type="family">Ananiadou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Over the last few years, there has been great interest in applying large language models (LLMs) to problems in the finance industry, and the field needs a robust LLM benchmark to support this work. Current financial LLM benchmarks contain simple tasks which are not representative of real use cases and have test sets with licences that do not allow commercial use. In response, we release AveniBench, a permissively licensed benchmark that tests a group of six key finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, summarisation and dialogue. We refactor the test sets to ensure that metrics are comparable, providing a unified framework. Furthermore, AveniBench introduces two task difficulty modes, easy and hard, enabling scalable evaluation based on real-world deployment needs. We use our benchmark to evaluate a diverse set of 20 widely used LLMs, from small open-weight models to proprietary systems like GPT-4. This evaluation initiates our public leaderboard, providing valuable insights for future academic research and commercial development.</abstract>
<identifier type="citekey">klimaszewski-etal-2025-avenibench</identifier>
<location>
<url>https://aclanthology.org/2025.finnlp-1.10/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>111</start>
<end>117</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AveniBench: Accessible and Versatile Evaluation of Finance Intelligence
%A Klimaszewski, Mateusz
%A Chen, Pinzhen
%A Guillou, Liane
%A Papaioannou, Ioannis
%A Haddow, Barry
%A Birch, Alexandra
%Y Chen, Chung-Chi
%Y Moreno-Sandoval, Antonio
%Y Huang, Jimin
%Y Xie, Qianqian
%Y Ananiadou, Sophia
%Y Chen, Hsin-Hsi
%S Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F klimaszewski-etal-2025-avenibench
%X Over the last few years, there has been great interest in applying large language models (LLMs) to problems in the finance industry, and the field needs a robust LLM benchmark to support this work. Current financial LLM benchmarks contain simple tasks which are not representative of real use cases and have test sets with licences that do not allow commercial use. In response, we release AveniBench, a permissively licensed benchmark that tests a group of six key finance-related skills: tabular reasoning, numerical reasoning, question answering, long context modelling, summarisation and dialogue. We refactor the test sets to ensure that metrics are comparable, providing a unified framework. Furthermore, AveniBench introduces two task difficulty modes, easy and hard, enabling scalable evaluation based on real-world deployment needs. We use our benchmark to evaluate a diverse set of 20 widely used LLMs, from small open-weight models to proprietary systems like GPT-4. This evaluation initiates our public leaderboard, providing valuable insights for future academic research and commercial development.
%U https://aclanthology.org/2025.finnlp-1.10/
%P 111-117
Markdown (Informal)
[AveniBench: Accessible and Versatile Evaluation of Finance Intelligence](https://aclanthology.org/2025.finnlp-1.10/) (Klimaszewski et al., FinNLP 2025)
ACL
- Mateusz Klimaszewski, Pinzhen Chen, Liane Guillou, Ioannis Papaioannou, Barry Haddow, and Alexandra Birch. 2025. AveniBench: Accessible and Versatile Evaluation of Finance Intelligence. In Proceedings of the Joint Workshop of the 9th Financial Technology and Natural Language Processing (FinNLP), the 6th Financial Narrative Processing (FNP), and the 1st Workshop on Large Language Models for Finance and Legal (LLMFinLegal), pages 111–117, Abu Dhabi, UAE. Association for Computational Linguistics.