% Workshop paper record (source URL: aclanthology.org). Text outside an entry
% is ignored by BibTeX, so these two lines act as a comment.
@inproceedings{hurlimann-cieliebak-2026-reprohum,
  title     = {{ReproHum} \#0031--01: Reproducing a Human Readability Evaluation for Question--Answer Generation Systems},
  author    = {H{\"u}rlimann, Manuela and
               Cieliebak, Mark},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.gem-main.88/},
  pages     = {1111--1116},
  isbn      = {979-8-89176-423-1},
  abstract  = {Human evaluations play a central role in assessing natural language processing systems, yet their robustness and reproducibility remain incompletely understood. This paper reports on a reproduction of the human readability evaluation from Yao et al. (2022) for question{--}answer generation (QAG) systems, conducted within the ReproHum project and the ReproNLP 2026 shared task (Belz et al., 2026). The original evaluation compared three QAG systems with respect to three criteria. We reproduced the evaluation of one of these criteria, readability, using a new group of five evaluators. We report descriptive results, inter-annotator agreement, system-level comparisons, and cross-study robustness metrics compared to the original study and two previous reproductions. Our results support all conclusions of the original evaluation and are largely consistent with two previous reproductions.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for the paper; the mods ID matches the citekey identifier further down. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="hurlimann-cieliebak-2026-reprohum">
    <titleInfo>
      <title>ReproHum #0031–01: Reproducing a Human Readability Evaluation for Question–Answer Generation Systems</title>
    </titleInfo>
    <!-- Authors of the paper -->
    <name type="personal">
      <namePart type="given">Manuela</namePart>
      <namePart type="family">Hürlimann</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Cieliebak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Simon</namePart>
        <namePart type="family">Mille</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sebastian</namePart>
        <namePart type="family">Gehrmann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Patrícia</namePart>
        <namePart type="family">Schmidtová</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ondřej</namePart>
        <namePart type="family">Dušek</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marzieh</namePart>
        <namePart type="family">Fadaee</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kyle</namePart>
        <namePart type="family">Lo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Enrico</namePart>
        <namePart type="family">Santus</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Gabriel</namePart>
        <namePart type="family">Stanovsky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-423-1</identifier>
    </relatedItem>
    <abstract>Human evaluations play a central role in assessing natural language processing systems, yet their robustness and reproducibility remain incompletely understood. This paper reports on a reproduction of the human readability evaluation from Yao et al. (2022) for question–answer generation (QAG) systems, conducted within the ReproHum project and the ReproNLP 2026 shared task (Belz et al., 2026). The original evaluation compared three QAG systems with respect to three criteria. We reproduced the evaluation of one of these criteria, readability, using a new group of five evaluators. We report descriptive results, inter-annotator agreement, system-level comparisons, and cross-study robustness metrics compared to the original study and two previous reproductions. Our results support all conclusions of the original evaluation and are largely consistent with two previous reproductions.</abstract>
    <identifier type="citekey">hurlimann-cieliebak-2026-reprohum</identifier>
    <location>
      <url>https://aclanthology.org/2026.gem-main.88/</url>
    </location>
    <!-- Issue date and page range of the paper -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1111</start>
        <end>1116</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ReproHum #0031–01: Reproducing a Human Readability Evaluation for Question–Answer Generation Systems
%A Hürlimann, Manuela
%A Cieliebak, Mark
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F hurlimann-cieliebak-2026-reprohum
%X Human evaluations play a central role in assessing natural language processing systems, yet their robustness and reproducibility remain incompletely understood. This paper reports on a reproduction of the human readability evaluation from Yao et al. (2022) for question–answer generation (QAG) systems, conducted within the ReproHum project and the ReproNLP 2026 shared task (Belz et al., 2026). The original evaluation compared three QAG systems with respect to three criteria. We reproduced the evaluation of one of these criteria, readability, using a new group of five evaluators. We report descriptive results, inter-annotator agreement, system-level comparisons, and cross-study robustness metrics compared to the original study and two previous reproductions. Our results support all conclusions of the original evaluation and are largely consistent with two previous reproductions.
%U https://aclanthology.org/2026.gem-main.88/
%P 1111-1116
Markdown (Informal)
[ReproHum #0031–01: Reproducing a Human Readability Evaluation for Question–Answer Generation Systems](https://aclanthology.org/2026.gem-main.88/) (Hürlimann & Cieliebak, GEM 2026)
ACL