@inproceedings{dong-etal-2025-model,
title = "Model Performance-Guided Evaluation Data Selection for Effective Prompt Optimization",
author = "Dong, Ximing and
Wang, Shaowei and
Lin, Dayi and
Hassan, Ahmed",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.147/",
doi = "10.18653/v1/2025.findings-acl.147",
pages = "2844--2859",
ISBN = "979-8-89176-256-5",
abstract = "Optimizing Large Language Model (LLM) performance requires well-crafted prompts, but manual prompt engineering is labor-intensive and often ineffective. Automated prompt optimization techniques address this challenge but the major of them rely on randomly selected evaluation subsets, which fail to represent the full dataset, leading to unreliable evaluations and suboptimal prompts. Existing coreset selection methods, designed for LLM benchmarking, are unsuitable for prompt optimization due to challenges in clustering similar samples, high data collection costs, and the unavailability of performance data for new or private datasets. To overcome these issues, we propose IPOMP, an Iterative evaluation data selection approach for effective Prompt Optimization using real time Model Performance. IPOMP is a two-stage approach that selects representative and diverse samples using semantic clustering and boundary analysis, followed by iterative refinement with real-time model performance data to replace redundant samples. Evaluations on two datasets BIG-bench and LIAR, and two models GPT-3.5 and GPT-4o-mini, show that IPOMP improves effectiveness by at least 1.6{\%} to 3.1{\%}, and stability by at least 50{\%} to 55.5{\%} compared with the best baseline across the studied datasets and models, with minimal computational overhead below 1{\%}. Furthermore, the results demonstrate that our real-time performance-guided refinement approach can be universally applied to enhance existing coreset selection methods."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dong-etal-2025-model">
<titleInfo>
<title>Model Performance-Guided Evaluation Data Selection for Effective Prompt Optimization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ximing</namePart>
<namePart type="family">Dong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shaowei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dayi</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Hassan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Optimizing Large Language Model (LLM) performance requires well-crafted prompts, but manual prompt engineering is labor-intensive and often ineffective. Automated prompt optimization techniques address this challenge but the major of them rely on randomly selected evaluation subsets, which fail to represent the full dataset, leading to unreliable evaluations and suboptimal prompts. Existing coreset selection methods, designed for LLM benchmarking, are unsuitable for prompt optimization due to challenges in clustering similar samples, high data collection costs, and the unavailability of performance data for new or private datasets. To overcome these issues, we propose IPOMP, an Iterative evaluation data selection approach for effective Prompt Optimization using real time Model Performance. IPOMP is a two-stage approach that selects representative and diverse samples using semantic clustering and boundary analysis, followed by iterative refinement with real-time model performance data to replace redundant samples. Evaluations on two datasets BIG-bench and LIAR, and two models GPT-3.5 and GPT-4o-mini, show that IPOMP improves effectiveness by at least 1.6% to 3.1%, and stability by at least 50% to 55.5% compared with the best baseline across the studied datasets and models, with minimal computational overhead below 1%. Furthermore, the results demonstrate that our real-time performance-guided refinement approach can be universally applied to enhance existing coreset selection methods.</abstract>
<identifier type="citekey">dong-etal-2025-model</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.147</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.147/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>2844</start>
<end>2859</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Model Performance-Guided Evaluation Data Selection for Effective Prompt Optimization
%A Dong, Ximing
%A Wang, Shaowei
%A Lin, Dayi
%A Hassan, Ahmed
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F dong-etal-2025-model
%X Optimizing Large Language Model (LLM) performance requires well-crafted prompts, but manual prompt engineering is labor-intensive and often ineffective. Automated prompt optimization techniques address this challenge, but the majority of them rely on randomly selected evaluation subsets, which fail to represent the full dataset, leading to unreliable evaluations and suboptimal prompts. Existing coreset selection methods, designed for LLM benchmarking, are unsuitable for prompt optimization due to challenges in clustering similar samples, high data collection costs, and the unavailability of performance data for new or private datasets. To overcome these issues, we propose IPOMP, an Iterative evaluation data selection approach for effective Prompt Optimization using real-time Model Performance. IPOMP is a two-stage approach that selects representative and diverse samples using semantic clustering and boundary analysis, followed by iterative refinement with real-time model performance data to replace redundant samples. Evaluations on two datasets (BIG-bench and LIAR) and two models (GPT-3.5 and GPT-4o-mini) show that IPOMP improves effectiveness by at least 1.6% to 3.1%, and stability by at least 50% to 55.5%, compared with the best baseline across the studied datasets and models, with minimal computational overhead below 1%. Furthermore, the results demonstrate that our real-time performance-guided refinement approach can be universally applied to enhance existing coreset selection methods.
%R 10.18653/v1/2025.findings-acl.147
%U https://aclanthology.org/2025.findings-acl.147/
%U https://doi.org/10.18653/v1/2025.findings-acl.147
%P 2844-2859

Markdown (Informal)

[Model Performance-Guided Evaluation Data Selection for Effective Prompt Optimization](https://aclanthology.org/2025.findings-acl.147/) (Dong et al., Findings 2025)

ACL

Ximing Dong, Shaowei Wang, Dayi Lin, and Ahmed Hassan. 2025. Model Performance-Guided Evaluation Data Selection for Effective Prompt Optimization. In Findings of the Association for Computational Linguistics: ACL 2025, pages 2844–2859, Vienna, Austria. Association for Computational Linguistics.