@inproceedings{thomson-etal-2026-process,
title = "Process Standardisation for Human Evaluation of {NLP} System Outputs",
author = "Thomson, Craig and
Gonz{\'a}lez Corbelle, Javier and
Belz, Anya",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.64/",
pages = "704--717",
ISBN = "979-8-89176-423-1",
abstract = "Human evaluation of NLP systems has high knowledge and effort thresholds. Researchers are often expected to design and run evaluations without formal training, while also creating the required resources from scratch. Recent work has started to address the knowledge threshold, but reusable tools that reduce effort remain limited. In this paper, we take a first step toward automated human-evaluation experiment creation by (i) surveying the processes and data resources used in a representative sample of current human evaluations in NLP, and (ii) deriving a canonical process model from these survey results, which (iii) provides a basis for standardised experiment design and automated toolkit development. The survey shows that recent human-evaluation practices are highly aligned in process structure, making reusable automation feasible."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thomson-etal-2026-process">
<titleInfo>
<title>Process Standardisation for Human Evaluation of NLP System Outputs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Craig</namePart>
<namePart type="family">Thomson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Javier</namePart>
<namePart type="family">González Corbelle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>Human evaluation of NLP systems has high knowledge and effort thresholds. Researchers are often expected to design and run evaluations without formal training, while also creating the required resources from scratch. Recent work has started to address the knowledge threshold, but reusable tools that reduce effort remain limited. In this paper, we take a first step toward automated human-evaluation experiment creation by (i) surveying the processes and data resources used in a representative sample of current human evaluations in NLP, and (ii) deriving a canonical process model from these survey results, which (iii) provides a basis for standardised experiment design and automated toolkit development. The survey shows that recent human-evaluation practices are highly aligned in process structure, making reusable automation feasible.</abstract>
<identifier type="citekey">thomson-etal-2026-process</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.64/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>704</start>
<end>717</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Process Standardisation for Human Evaluation of NLP System Outputs
%A Thomson, Craig
%A González Corbelle, Javier
%A Belz, Anya
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F thomson-etal-2026-process
%X Human evaluation of NLP systems has high knowledge and effort thresholds. Researchers are often expected to design and run evaluations without formal training, while also creating the required resources from scratch. Recent work has started to address the knowledge threshold, but reusable tools that reduce effort remain limited. In this paper, we take a first step toward automated human-evaluation experiment creation by (i) surveying the processes and data resources used in a representative sample of current human evaluations in NLP, and (ii) deriving a canonical process model from these survey results, which (iii) provides a basis for standardised experiment design and automated toolkit development. The survey shows that recent human-evaluation practices are highly aligned in process structure, making reusable automation feasible.
%U https://aclanthology.org/2026.gem-main.64/
%P 704-717
Markdown (Informal)
[Process Standardisation for Human Evaluation of NLP System Outputs](https://aclanthology.org/2026.gem-main.64/) (Thomson et al., GEM 2026)
ACL