@inproceedings{zaghouani-2026-position,
title = "Position: What Are We Measuring? Rethinking Evaluation in Natural Language Generation",
author = "Zaghouani, Wajdi",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.79/",
pages = "1021--1028",
ISBN = "979-8-89176-423-1",
abstract = "The field of natural language generation has accumulated a rich ecosystem of automatic evaluation metrics, yet it lacks a coherent theory of what those metrics are actually measuring. Drawing on measurement theory from the quantitative social sciences, this paper argues that current NLG evaluation practices suffer from a fundamental construct validity problem: metrics are treated as proxies for output quality without explicit specification of the underlying constructs they are meant to operationalize. We examine four dominant evaluation paradigms (reference-based metrics, embedding-based metrics, LLM-as-judge, and human evaluation) and demonstrate that each conflates construct definition with operationalization. Building on a long psychometric tradition reaching back to Cronbach and Meehl (1955) and on recent NLP work that has begun to apply this tradition to bias measurement, dialogue evaluation, and benchmark design, we propose that the field adopt a measurement modeling perspective for NLG evaluation. We borrow the concepts of construct validity, reliability, and consequential validity as a foundation for more principled evaluation, and we outline a preliminary taxonomy of NLG quality constructs as a starting point for this work."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zaghouani-2026-position">
<titleInfo>
<title>Position: What Are We Measuring? Rethinking Evaluation in Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wajdi</namePart>
<namePart type="family">Zaghouani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>The field of natural language generation has accumulated a rich ecosystem of automatic evaluation metrics, yet it lacks a coherent theory of what those metrics are actually measuring. Drawing on measurement theory from the quantitative social sciences, this paper argues that current NLG evaluation practices suffer from a fundamental construct validity problem: metrics are treated as proxies for output quality without explicit specification of the underlying constructs they are meant to operationalize. We examine four dominant evaluation paradigms (reference-based metrics, embedding-based metrics, LLM-as-judge, and human evaluation) and demonstrate that each conflates construct definition with operationalization. Building on a long psychometric tradition reaching back to Cronbach and Meehl (1955) and on recent NLP work that has begun to apply this tradition to bias measurement, dialogue evaluation, and benchmark design, we propose that the field adopt a measurement modeling perspective for NLG evaluation. We borrow the concepts of construct validity, reliability, and consequential validity as a foundation for more principled evaluation, and we outline a preliminary taxonomy of NLG quality constructs as a starting point for this work.</abstract>
<identifier type="citekey">zaghouani-2026-position</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.79/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1021</start>
<end>1028</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Position: What Are We Measuring? Rethinking Evaluation in Natural Language Generation
%A Zaghouani, Wajdi
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F zaghouani-2026-position
%X The field of natural language generation has accumulated a rich ecosystem of automatic evaluation metrics, yet it lacks a coherent theory of what those metrics are actually measuring. Drawing on measurement theory from the quantitative social sciences, this paper argues that current NLG evaluation practices suffer from a fundamental construct validity problem: metrics are treated as proxies for output quality without explicit specification of the underlying constructs they are meant to operationalize. We examine four dominant evaluation paradigms (reference-based metrics, embedding-based metrics, LLM-as-judge, and human evaluation) and demonstrate that each conflates construct definition with operationalization. Building on a long psychometric tradition reaching back to Cronbach and Meehl (1955) and on recent NLP work that has begun to apply this tradition to bias measurement, dialogue evaluation, and benchmark design, we propose that the field adopt a measurement modeling perspective for NLG evaluation. We borrow the concepts of construct validity, reliability, and consequential validity as a foundation for more principled evaluation, and we outline a preliminary taxonomy of NLG quality constructs as a starting point for this work.
%U https://aclanthology.org/2026.gem-main.79/
%P 1021-1028
Markdown (Informal)
[Position: What Are We Measuring? Rethinking Evaluation in Natural Language Generation](https://aclanthology.org/2026.gem-main.79/) (Zaghouani, GEM 2026)
ACL