@inproceedings{vir-etal-2025-promptevals,
title = "{PROMPTEVALS}: A Dataset of Assertions and Guardrails for Custom Production Large Language Model Pipelines",
author = "Vir, Reya and
Shankar, Shreya and
Chase, Harrison and
Hinthorn, William and
Parameswaran, Aditya",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.213/",
doi = "10.18653/v1/2025.naacl-long.213",
pages = "4225--4245",
ISBN = "979-8-89176-189-6",
abstract = "Large language models (LLMs) are increasingly deployed in specialized production data processing pipelines across diverse domains{---}such as finance, marketing, and e-commerce. However, when running them in production across many inputs, they often fail to follow instructions or meet developer expectations. To improve reliability in these applications, creating assertions or guardrails for LLM outputs to run alongside the pipelines is essential. Yet, determining the right set of assertions that capture developer requirements for a task is challenging. In this paper, we introduce PROMPTEVALS, a dataset of 2087 LLM pipeline prompts with 12623 corresponding assertion criteria, sourced from developers using our open-source LLM pipeline tools. This dataset is larger than previous collections. Using a hold-out test split of PROMPTEVALS as a benchmark, we evaluated closed- and open-source models in generating relevant assertions. Notably, our fine-tuned Mistral and Llama 3 models outperform GPT-4o by 20.93{\%} on average, offering both reduced latency and improved performance. We believe our dataset can spur further research in LLM reliability, alignment, and prompt engineering."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vir-etal-2025-promptevals">
<titleInfo>
<title>PROMPTEVALS: A Dataset of Assertions and Guardrails for Custom Production Large Language Model Pipelines</title>
</titleInfo>
<name type="personal">
<namePart type="given">Reya</namePart>
<namePart type="family">Vir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shreya</namePart>
<namePart type="family">Shankar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harrison</namePart>
<namePart type="family">Chase</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">William</namePart>
<namePart type="family">Hinthorn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Parameswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are increasingly deployed in specialized production data processing pipelines across diverse domains—such as finance, marketing, and e-commerce. However, when running them in production across many inputs, they often fail to follow instructions or meet developer expectations. To improve reliability in these applications, creating assertions or guardrails for LLM outputs to run alongside the pipelines is essential. Yet, determining the right set of assertions that capture developer requirements for a task is challenging. In this paper, we introduce PROMPTEVALS, a dataset of 2087 LLM pipeline prompts with 12623 corresponding assertion criteria, sourced from developers using our open-source LLM pipeline tools. This dataset is larger than previous collections. Using a hold-out test split of PROMPTEVALS as a benchmark, we evaluated closed- and open-source models in generating relevant assertions. Notably, our fine-tuned Mistral and Llama 3 models outperform GPT-4o by 20.93% on average, offering both reduced latency and improved performance. We believe our dataset can spur further research in LLM reliability, alignment, and prompt engineering.</abstract>
<identifier type="citekey">vir-etal-2025-promptevals</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.213</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.213/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>4225</start>
<end>4245</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PROMPTEVALS: A Dataset of Assertions and Guardrails for Custom Production Large Language Model Pipelines
%A Vir, Reya
%A Shankar, Shreya
%A Chase, Harrison
%A Hinthorn, William
%A Parameswaran, Aditya
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F vir-etal-2025-promptevals
%X Large language models (LLMs) are increasingly deployed in specialized production data processing pipelines across diverse domains—such as finance, marketing, and e-commerce. However, when running them in production across many inputs, they often fail to follow instructions or meet developer expectations. To improve reliability in these applications, creating assertions or guardrails for LLM outputs to run alongside the pipelines is essential. Yet, determining the right set of assertions that capture developer requirements for a task is challenging. In this paper, we introduce PROMPTEVALS, a dataset of 2087 LLM pipeline prompts with 12623 corresponding assertion criteria, sourced from developers using our open-source LLM pipeline tools. This dataset is larger than previous collections. Using a hold-out test split of PROMPTEVALS as a benchmark, we evaluated closed- and open-source models in generating relevant assertions. Notably, our fine-tuned Mistral and Llama 3 models outperform GPT-4o by 20.93% on average, offering both reduced latency and improved performance. We believe our dataset can spur further research in LLM reliability, alignment, and prompt engineering.
%R 10.18653/v1/2025.naacl-long.213
%U https://aclanthology.org/2025.naacl-long.213/
%U https://doi.org/10.18653/v1/2025.naacl-long.213
%P 4225-4245
Markdown (Informal)
[PROMPTEVALS: A Dataset of Assertions and Guardrails for Custom Production Large Language Model Pipelines](https://aclanthology.org/2025.naacl-long.213/) (Vir et al., NAACL 2025)
ACL