% BibTeX record for the ReproNLP 2026 replication paper, published in the
% proceedings of the Fifth GEM workshop (pages 1082--1093).
% NOTE(review): address holds "San Diego, California, USA", which looks like
% the conference venue rather than the publisher's city; confirm before using
% it with styles that print address next to publisher.
@inproceedings{mroczek-etal-2026-repronlp,
    title     = {{ReproNLP} 2026: A Third Replication of the Human Evaluation of a {QAG} System for Children{'}s Storybooks},
    author    = {Mroczek, Marcel and
                 Albarello, Chiara and
                 Floch, Paul-Emmanuel and
                 Gawinecki, Maciej},
    editor    = {Mille, Simon and
                 Gehrmann, Sebastian and
                 Schmidtov{\'a}, Patr{\'i}cia and
                 Du{\v{s}}ek, Ond{\v{r}}ej and
                 Fadaee, Marzieh and
                 Lo, Kyle and
                 Santus, Enrico and
                 Stanovsky, Gabriel},
    booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.gem-main.85/},
    pages     = {1082--1093},
    isbn      = {979-8-89176-423-1},
    abstract  = {Reproducibility of human evaluations in Natural Language Processing remains a critical open challenge. This paper presents a third independent replication of the human evaluation from Yao et al. (2022), which assessed an automated Question-Answer Generation (QAG) system for children{'}s storybooks against a baseline system and human-authored ground truth, across three criteria {---} Readability, Question Relevance, and Answer Relevance {---} using five NLP-literate annotators. Our replication confirms the main findings of the original study: the QAG system outperforms the baseline on Readability and Question Relevance, and Ground Truth ranks highest across all criteria. System rankings are preserved across all three criteria, with the exception of a statistically non-significant difference in Answer Relevance. This holds true despite a severe drop in inter-annotator agreement for Readability. We further document several methodological concerns, some unreported in prior replications, including data quality issues and evaluation design limitations identified during our pilot study.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey mroczek-etal-2026-repronlp.
       Top-level name elements are the paper's authors; the relatedItem of
       type "host" describes the proceedings volume and lists its editors. -->
  <mods ID="mroczek-etal-2026-repronlp">
    <titleInfo>
      <title>ReproNLP 2026: A Third Replication of the Human Evaluation of a QAG System for Children’s Storybooks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Marcel</namePart>
      <namePart type="family">Mroczek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chiara</namePart>
      <namePart type="family">Albarello</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paul-Emmanuel</namePart>
      <namePart type="family">Floch</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maciej</namePart>
      <namePart type="family">Gawinecki</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host publication: the workshop proceedings this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Simon</namePart>
        <namePart type="family">Mille</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sebastian</namePart>
        <namePart type="family">Gehrmann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Patrícia</namePart>
        <namePart type="family">Schmidtová</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ondřej</namePart>
        <namePart type="family">Dušek</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marzieh</namePart>
        <namePart type="family">Fadaee</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kyle</namePart>
        <namePart type="family">Lo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Enrico</namePart>
        <namePart type="family">Santus</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Gabriel</namePart>
        <namePart type="family">Stanovsky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-423-1</identifier>
    </relatedItem>
    <abstract>Reproducibility of human evaluations in Natural Language Processing remains a critical open challenge. This paper presents a third independent replication of the human evaluation from Yao et al. (2022), which assessed an automated Question-Answer Generation (QAG) system for children’s storybooks against a baseline system and human-authored ground truth, across three criteria — Readability, Question Relevance, and Answer Relevance — using five NLP-literate annotators. Our replication confirms the main findings of the original study: the QAG system outperforms the baseline on Readability and Question Relevance, and Ground Truth ranks highest across all criteria. System rankings are preserved across all three criteria, with the exception of a statistically non-significant difference in Answer Relevance. This holds true despite a severe drop in inter-annotator agreement for Readability. We further document several methodological concerns, some unreported in prior replications, including data quality issues and evaluation design limitations identified during our pilot study.</abstract>
    <identifier type="citekey">mroczek-etal-2026-repronlp</identifier>
    <location>
      <url>https://aclanthology.org/2026.gem-main.85/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1082</start>
        <end>1093</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ReproNLP 2026: A Third Replication of the Human Evaluation of a QAG System for Children’s Storybooks
%A Mroczek, Marcel
%A Albarello, Chiara
%A Floch, Paul-Emmanuel
%A Gawinecki, Maciej
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F mroczek-etal-2026-repronlp
%X Reproducibility of human evaluations in Natural Language Processing remains a critical open challenge. This paper presents a third independent replication of the human evaluation from Yao et al. (2022), which assessed an automated Question-Answer Generation (QAG) system for children’s storybooks against a baseline system and human-authored ground truth, across three criteria — Readability, Question Relevance, and Answer Relevance — using five NLP-literate annotators. Our replication confirms the main findings of the original study: the QAG system outperforms the baseline on Readability and Question Relevance, and Ground Truth ranks highest across all criteria. System rankings are preserved across all three criteria, with the exception of a statistically non-significant difference in Answer Relevance. This holds true despite a severe drop in inter-annotator agreement for Readability. We further document several methodological concerns, some unreported in prior replications, including data quality issues and evaluation design limitations identified during our pilot study.
%U https://aclanthology.org/2026.gem-main.85/
%P 1082-1093
Markdown (Informal)
[ReproNLP 2026: A Third Replication of the Human Evaluation of a QAG System for Children’s Storybooks](https://aclanthology.org/2026.gem-main.85/) (Mroczek et al., GEM 2026)
ACL