@inproceedings{santillan-cooper-etal-2025-synthetic,
title = "Synthetic Data for Evaluation: Supporting {LLM}-as-a-Judge Workflows with {E}val{A}ssist",
author = "Santill{\'a}n Cooper, Mart{\'i}n and
Ashktorab, Zahra and
Do, Hyo Jin and
Miehling, Erik and
Geyer, Werner and
Gajcin, Jasmina and
Daly, Elizabeth M. and
Pan, Qian and
Desmond, Michael",
editor = {Habernal, Ivan and
Schulam, Peter and
Tiedemann, J{\"o}rg},
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-demos.1/",
pages = "1--11",
ISBN = "979-8-89176-334-0",
abstract = "We present a synthetic data generation tool integrated into EvalAssist. EvalAssist is a web-based application designed to assist human-centered evaluation of language model outputs by allowing users to refine LLM-as-a-Judge evaluation criteria. The synthetic data generation tool in EvalAssist is tailored for evaluation contexts and informed by findings from user studies with AI practitioners. Participants identified key pain points in current workflows including circularity risks (where models are judged by criteria derived by themselves), compounded bias (amplification of biases across multiple stages of a pipeline), and poor support for edge cases, and expressed a strong preference for real-world grounding and fine-grained control. In response, our tool supports flexible prompting, RAG-based grounding, persona diversity, and iterative generation workflows. We also incorporate features for quality assurance and edge case discovery."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="santillan-cooper-etal-2025-synthetic">
<titleInfo>
<title>Synthetic Data for Evaluation: Supporting LLM-as-a-Judge Workflows with EvalAssist</title>
</titleInfo>
<name type="personal">
<namePart type="given">Martín</namePart>
<namePart type="family">Santillán Cooper</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zahra</namePart>
<namePart type="family">Ashktorab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hyo</namePart>
<namePart type="given">Jin</namePart>
<namePart type="family">Do</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Erik</namePart>
<namePart type="family">Miehling</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Werner</namePart>
<namePart type="family">Geyer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jasmina</namePart>
<namePart type="family">Gajcin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Daly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qian</namePart>
<namePart type="family">Pan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Desmond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Habernal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Schulam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-334-0</identifier>
</relatedItem>
<abstract>We present a synthetic data generation tool integrated into EvalAssist. EvalAssist is a web-based application designed to assist human-centered evaluation of language model outputs by allowing users to refine LLM-as-a-Judge evaluation criteria. The synthetic data generation tool in EvalAssist is tailored for evaluation contexts and informed by findings from user studies with AI practitioners. Participants identified key pain points in current workflows including circularity risks (where models are judged by criteria derived by themselves), compounded bias (amplification of biases across multiple stages of a pipeline), and poor support for edge cases, and expressed a strong preference for real-world grounding and fine-grained control. In response, our tool supports flexible prompting, RAG-based grounding, persona diversity, and iterative generation workflows. We also incorporate features for quality assurance and edge case discovery.</abstract>
<identifier type="citekey">santillan-cooper-etal-2025-synthetic</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-demos.1/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>1</start>
<end>11</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Synthetic Data for Evaluation: Supporting LLM-as-a-Judge Workflows with EvalAssist
%A Santillán Cooper, Martín
%A Ashktorab, Zahra
%A Do, Hyo Jin
%A Miehling, Erik
%A Geyer, Werner
%A Gajcin, Jasmina
%A Daly, Elizabeth M.
%A Pan, Qian
%A Desmond, Michael
%Y Habernal, Ivan
%Y Schulam, Peter
%Y Tiedemann, Jörg
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-334-0
%F santillan-cooper-etal-2025-synthetic
%X We present a synthetic data generation tool integrated into EvalAssist. EvalAssist is a web-based application designed to assist human-centered evaluation of language model outputs by allowing users to refine LLM-as-a-Judge evaluation criteria. The synthetic data generation tool in EvalAssist is tailored for evaluation contexts and informed by findings from user studies with AI practitioners. Participants identified key pain points in current workflows including circularity risks (where models are judged by criteria derived by themselves), compounded bias (amplification of biases across multiple stages of a pipeline), and poor support for edge cases, and expressed a strong preference for real-world grounding and fine-grained control. In response, our tool supports flexible prompting, RAG-based grounding, persona diversity, and iterative generation workflows. We also incorporate features for quality assurance and edge case discovery.
%U https://aclanthology.org/2025.emnlp-demos.1/
%P 1-11
Markdown (Informal)
[Synthetic Data for Evaluation: Supporting LLM-as-a-Judge Workflows with EvalAssist](https://aclanthology.org/2025.emnlp-demos.1/) (Santillán Cooper et al., EMNLP 2025)
ACL
- Martín Santillán Cooper, Zahra Ashktorab, Hyo Jin Do, Erik Miehling, Werner Geyer, Jasmina Gajcin, Elizabeth M. Daly, Qian Pan, and Michael Desmond. 2025. Synthetic Data for Evaluation: Supporting LLM-as-a-Judge Workflows with EvalAssist. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 1–11, Suzhou, China. Association for Computational Linguistics.