@inproceedings{yuan-etal-2025-beyond-one,
title = "Beyond One-Size-Fits-All: Tailored Benchmarks for Efficient Evaluation",
author = "Yuan, Peiwen and
Zhang, Yueqi and
Feng, Shaoxiong and
Li, Yiwei and
Wang, Xinglin and
Shi, Jiayi and
Tan, Chuyi and
Pan, Boyuan and
Hu, Yao and
Li, Kan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.759/",
doi = "10.18653/v1/2025.acl-long.759",
pages = "15591--15615",
ISBN = "979-8-89176-251-0",
abstract = "Evaluating models on large benchmarks can be very resource-intensive, especially during a period of rapid model evolution. Existing efficient evaluation methods estimate the performance of target models by testing them on a small, static coreset derived from the publicly available evaluation results of source models, which are separate from the target models. However, these approaches rely on the assumption that target models have high prediction consistency with source models, which doesn{'}t generalize well in practice. To fill this gap, we propose TailoredBench, a method that conducts customized evaluation tailored to each target model. Specifically, a Global-coreset is first constructed as a probe to identify the most consistent source models for each target model with an adaptive source model selection strategy. Afterwards, a scalable K-Medoids clustering algorithm is proposed to extend the Global-coreset to a tailored Native-coreset for each target model. According to the predictions on respective Native-coreset, we estimate the overall performance of target models with a calibrated estimation strategy. Comprehensive experiments on five benchmarks across over 300 models demonstrate that compared to best performing baselines, TailoredBench achieves an average reduction of 31.4{\%} in MAE of accuracy estimates under the same inference budgets, showcasing strong effectiveness and generalizability."
}
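The abstract describes a coreset-style estimation pipeline: cluster benchmark examples by how consistently source models answer them, run the target model only on the cluster medoids, and extrapolate its overall accuracy from those few predictions. The sketch below is a minimal, generic illustration of that idea, not the paper's released implementation: the function names, the Hamming-distance choice over source-model correctness patterns, and the cluster-size weighting are assumptions for illustration, and it omits the Global-coreset probe, the adaptive source-model selection, and the calibrated estimation strategy described in the abstract.

# Illustrative sketch only (assumed names and details, not TailoredBench itself):
# pick a coreset of benchmark examples with plain K-Medoids over source-model
# correctness patterns, then estimate a target model's accuracy from its
# results on that coreset alone.
import numpy as np

def k_medoids(dist, k, n_iter=100, seed=0):
    """Plain K-Medoids on a precomputed distance matrix; returns medoid indices."""
    rng = np.random.default_rng(seed)
    n = dist.shape[0]
    medoids = rng.choice(n, size=k, replace=False)
    for _ in range(n_iter):
        # Assign every point to its nearest medoid.
        labels = np.argmin(dist[:, medoids], axis=1)
        new_medoids = medoids.copy()
        for c in range(k):
            members = np.where(labels == c)[0]
            if len(members) == 0:
                continue
            # New medoid = cluster member with the smallest total within-cluster distance.
            within = dist[np.ix_(members, members)].sum(axis=1)
            new_medoids[c] = members[np.argmin(within)]
        if np.array_equal(new_medoids, medoids):
            break
        medoids = new_medoids
    return medoids

def estimate_accuracy(source_preds, run_target_on, k=20):
    """source_preds: (n_source_models, n_examples) 0/1 correctness matrix.
    run_target_on: callable taking example indices and returning the target
    model's 0/1 correctness on just those examples (the only ones we evaluate)."""
    # Represent each example by its pattern of source-model correctness;
    # mean absolute difference between patterns is the Hamming distance.
    X = source_preds.T.astype(float)                        # (n_examples, n_source_models)
    dist = np.abs(X[:, None, :] - X[None, :, :]).mean(-1)   # pairwise example distances
    coreset = k_medoids(dist, k)
    labels = np.argmin(dist[:, coreset], axis=1)
    cluster_sizes = np.bincount(labels, minlength=k)
    target_on_coreset = np.asarray(run_target_on(coreset), dtype=float)
    # Each medoid stands in for its whole cluster: cluster-size-weighted mean.
    return float((target_on_coreset * cluster_sizes).sum() / cluster_sizes.sum())

if __name__ == "__main__":
    # Toy usage with synthetic correctness data (purely hypothetical numbers).
    rng = np.random.default_rng(1)
    source = (rng.random((50, 500)) < 0.6).astype(int)      # 50 source models, 500 examples
    target_full = (rng.random(500) < 0.55).astype(int)      # target's (unknown) full results
    est = estimate_accuracy(source, lambda idx: target_full[idx], k=25)
    print(f"estimated accuracy: {est:.3f}  true accuracy: {target_full.mean():.3f}")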