@inproceedings{kawamura-etal-2026-protea,
title = "{PROTEA}: Offline Evaluation and Iterative Refinement for Multi-Agent {LLM} Workflows",
author = "Kawamura, Kazuki and
Waki, Satoshi and
Tateno, Kei",
editor = "Durrett, Greg and
Jian, Ping",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 3: System Demonstrations)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-demo.3/",
pages = "27--35",
ISBN = "979-8-89176-392-0",
abstract = "Multi-agent LLM workflows, which are AI systems composed of multiple role-specialized LLM calls, often outperform single prompts, but they are notoriously difficult to debug and refine. Failures can originate from subtle mistakes in intermediate artifacts that silently propagate downstream, forcing developers to read long traces and guess which agent to edit. We present PROTEA, a unified UI that closes the loop for offline, test-case{--}driven improvement of multi-agent workflows, enabling developers to efficiently diagnose and fix errors without manual inspection of long traces. PROTEA executes a workflow, scores intermediate artifacts with configurable evaluators, and overlays per-node states and rationales on the workflow graph to localize likely bottlenecks. To address the difficulty of preparing intermediate reference in complex systems, PROTEA performs backward node evaluation by inferring each node{'}s ideal expected output from terminal supervision and graph context, and comparing it with the observed node output. For a selected node, it proposes a targeted prompt patch as an editable diff, then automatically re-runs and re-evaluates the workflow to show before/after output diffs and score trajectories within the same interface. Using PROTEA, users can visually pinpoint system-wide bottlenecks at a glance, streamline remediation via semi-automated prompt patching, and immediately verify pre- and post-correction outcomes within a unified loop."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kawamura-etal-2026-protea">
<titleInfo>
<title>PROTEA: Offline Evaluation and Iterative Refinement for Multi-Agent LLM Workflows</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kazuki</namePart>
<namePart type="family">Kawamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Satoshi</namePart>
<namePart type="family">Waki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kei</namePart>
<namePart type="family">Tateno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Greg</namePart>
<namePart type="family">Durrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ping</namePart>
<namePart type="family">Jian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-392-0</identifier>
</relatedItem>
<abstract>Multi-agent LLM workflows, which are AI systems composed of multiple role-specialized LLM calls, often outperform single prompts, but they are notoriously difficult to debug and refine. Failures can originate from subtle mistakes in intermediate artifacts that silently propagate downstream, forcing developers to read long traces and guess which agent to edit. We present PROTEA, a unified UI that closes the loop for offline, test-case–driven improvement of multi-agent workflows, enabling developers to efficiently diagnose and fix errors without manual inspection of long traces. PROTEA executes a workflow, scores intermediate artifacts with configurable evaluators, and overlays per-node states and rationales on the workflow graph to localize likely bottlenecks. To address the difficulty of preparing intermediate reference in complex systems, PROTEA performs backward node evaluation by inferring each node’s ideal expected output from terminal supervision and graph context, and comparing it with the observed node output. For a selected node, it proposes a targeted prompt patch as an editable diff, then automatically re-runs and re-evaluates the workflow to show before/after output diffs and score trajectories within the same interface. Using PROTEA, users can visually pinpoint system-wide bottlenecks at a glance, streamline remediation via semi-automated prompt patching, and immediately verify pre- and post-correction outcomes within a unified loop.</abstract>
<identifier type="citekey">kawamura-etal-2026-protea</identifier>
<location>
<url>https://aclanthology.org/2026.acl-demo.3/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>27</start>
<end>35</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PROTEA: Offline Evaluation and Iterative Refinement for Multi-Agent LLM Workflows
%A Kawamura, Kazuki
%A Waki, Satoshi
%A Tateno, Kei
%Y Durrett, Greg
%Y Jian, Ping
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-392-0
%F kawamura-etal-2026-protea
%X Multi-agent LLM workflows, which are AI systems composed of multiple role-specialized LLM calls, often outperform single prompts, but they are notoriously difficult to debug and refine. Failures can originate from subtle mistakes in intermediate artifacts that silently propagate downstream, forcing developers to read long traces and guess which agent to edit. We present PROTEA, a unified UI that closes the loop for offline, test-case–driven improvement of multi-agent workflows, enabling developers to efficiently diagnose and fix errors without manual inspection of long traces. PROTEA executes a workflow, scores intermediate artifacts with configurable evaluators, and overlays per-node states and rationales on the workflow graph to localize likely bottlenecks. To address the difficulty of preparing intermediate reference in complex systems, PROTEA performs backward node evaluation by inferring each node’s ideal expected output from terminal supervision and graph context, and comparing it with the observed node output. For a selected node, it proposes a targeted prompt patch as an editable diff, then automatically re-runs and re-evaluates the workflow to show before/after output diffs and score trajectories within the same interface. Using PROTEA, users can visually pinpoint system-wide bottlenecks at a glance, streamline remediation via semi-automated prompt patching, and immediately verify pre- and post-correction outcomes within a unified loop.
%U https://aclanthology.org/2026.acl-demo.3/
%P 27-35
Markdown (Informal)
[PROTEA: Offline Evaluation and Iterative Refinement for Multi-Agent LLM Workflows](https://aclanthology.org/2026.acl-demo.3/) (Kawamura et al., ACL 2026)
ACL