@inproceedings{bhagat-etal-2025-evaluating,
title = "Evaluating Compound {AI} Systems through Behaviors, Not Benchmarks",
author = "Bhagat, Pranav and
Shastry, K N Ajay and
Panda, Pranoy and
Devaguptapu, Chaitanya",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1314/",
pages = "24193--24222",
ISBN = "979-8-89176-335-7",
abstract = "Compound AI (CAI) systems, also referred to as LLM Agents, combine LLMs with retrievers and tools to enable information-seeking applications in the real-world. Thus, ensuring these systems perform reliably is critical. However, traditional evaluation using benchmark datasets and aggregate metrics often fails to capture their true operational performance. This is because understanding the operational efficacy of these information-seeking systems requires the ability to probe their behavior across a spectrum of simulated scenarios to identify potential failure modes. Thus, we present a behavior-driven evaluation framework that generates test specifications - explicit descriptions of expected system behaviors in specific scenarios - aligned with real usage contexts. These test specifications serve as formal declarations of system requirements that are then automatically transformed into concrete test cases. Specifically, our framework operates in two phases: (1) generating diverse test specifications via submodular optimization over semantic diversity and document coverage of the tests, and (2) implementing these specifications through graph-based pipelines supporting both tabular and textual sources. Evaluations on QuAC {\&} HybriDialogue datasets, across SoTA LLMs, reveal that our framework identifies failure modes missed by traditional metrics, demonstrating failure rates twice as high as human-curated datasets."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bhagat-etal-2025-evaluating">
<titleInfo>
<title>Evaluating Compound AI Systems through Behaviors, Not Benchmarks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pranav</namePart>
<namePart type="family">Bhagat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">K</namePart>
<namePart type="given">N</namePart>
<namePart type="given">Ajay</namePart>
<namePart type="family">Shastry</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pranoy</namePart>
<namePart type="family">Panda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaitanya</namePart>
<namePart type="family">Devaguptapu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Compound AI (CAI) systems, also referred to as LLM Agents, combine LLMs with retrievers and tools to enable information-seeking applications in the real-world. Thus, ensuring these systems perform reliably is critical. However, traditional evaluation using benchmark datasets and aggregate metrics often fails to capture their true operational performance. This is because understanding the operational efficacy of these information-seeking systems requires the ability to probe their behavior across a spectrum of simulated scenarios to identify potential failure modes. Thus, we present a behavior-driven evaluation framework that generates test specifications - explicit descriptions of expected system behaviors in specific scenarios - aligned with real usage contexts. These test specifications serve as formal declarations of system requirements that are then automatically transformed into concrete test cases. Specifically, our framework operates in two phases: (1) generating diverse test specifications via submodular optimization over semantic diversity and document coverage of the tests, and (2) implementing these specifications through graph-based pipelines supporting both tabular and textual sources. Evaluations on QuAC & HybriDialogue datasets, across SoTA LLMs, reveal that our framework identifies failure modes missed by traditional metrics, demonstrating failure rates twice as high as human-curated datasets.</abstract>
<identifier type="citekey">bhagat-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1314/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>24193</start>
<end>24222</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Compound AI Systems through Behaviors, Not Benchmarks
%A Bhagat, Pranav
%A Shastry, K. N. Ajay
%A Panda, Pranoy
%A Devaguptapu, Chaitanya
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F bhagat-etal-2025-evaluating
%X Compound AI (CAI) systems, also referred to as LLM Agents, combine LLMs with retrievers and tools to enable information-seeking applications in the real-world. Thus, ensuring these systems perform reliably is critical. However, traditional evaluation using benchmark datasets and aggregate metrics often fails to capture their true operational performance. This is because understanding the operational efficacy of these information-seeking systems requires the ability to probe their behavior across a spectrum of simulated scenarios to identify potential failure modes. Thus, we present a behavior-driven evaluation framework that generates test specifications - explicit descriptions of expected system behaviors in specific scenarios - aligned with real usage contexts. These test specifications serve as formal declarations of system requirements that are then automatically transformed into concrete test cases. Specifically, our framework operates in two phases: (1) generating diverse test specifications via submodular optimization over semantic diversity and document coverage of the tests, and (2) implementing these specifications through graph-based pipelines supporting both tabular and textual sources. Evaluations on QuAC & HybriDialogue datasets, across SoTA LLMs, reveal that our framework identifies failure modes missed by traditional metrics, demonstrating failure rates twice as high as human-curated datasets.
%U https://aclanthology.org/2025.findings-emnlp.1314/
%P 24193-24222
Markdown (Informal)
[Evaluating Compound AI Systems through Behaviors, Not Benchmarks](https://aclanthology.org/2025.findings-emnlp.1314/) (Bhagat et al., Findings 2025)
ACL