% ReproHum #0729-04 reproduction report (Junker), published in the
% proceedings of the Fourth Workshop on Generation, Evaluation and Metrics.
% Whole words are brace-protected in the title so that sentence-casing
% bibliography styles keep "ReproHum", "MemSum" and "Markov" intact.
@inproceedings{junker-2025-reprohum,
  title     = {{ReproHum} {\#}0729-04: Human Evaluation Reproduction Report for ``{MemSum}: Extractive Summarization of Long Documents Using Multi-Step Episodic {Markov} Decision Processes''},
  author    = {Junker, Simeon},
  editor    = {Arviv, Ofir and
               Clinciu, Miruna and
               Dhole, Kaustubh and
               Dror, Rotem and
               Gehrmann, Sebastian and
               Habba, Eliya and
               Itzhak, Itay and
               Mille, Simon and
               Perlitz, Yotam and
               Santus, Enrico and
               Sedoc, Jo{\~a}o and
               Shmueli Scheuer, Michal and
               Stanovsky, Gabriel and
               Tafjord, Oyvind},
  booktitle = {Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics ({GEM}{\texttwosuperior})},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.gem-1.50/},
  pages     = {561--567},
  isbn      = {979-8-89176-261-9},
  abstract  = {Human evaluation is indispensable in natural language processing (NLP), as automatic metrics are known to not always align well with human judgments. However, the reproducibility of human evaluations can be problematic since results are susceptible to many factors, the details of which are often missing from the respective works. As part of the ReproHum project, this work aims to reproduce the human evaluation of a single criterion in the paper ``MemSum: Extractive Summarization of Long Documents Using Multi-Step Episodic Markov Decision Processes'' (Gu et al., 2022). The results of our reproduction differ noticeably from those of the original study. To explain this discrepancy, we discuss differences in the experimental setup, as well as more general characteristics of the selected domain and the generated summaries.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="junker-2025-reprohum">
<titleInfo>
<title>ReproHum #0729-04: Human Evaluation Reproduction Report for “MemSum: Extractive Summarization of Long Documents Using Multi-Step Episodic Markov Decision Processes”</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simeon</namePart>
<namePart type="family">Junker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Arviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliya</namePart>
<namePart type="family">Habba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itay</namePart>
<namePart type="family">Itzhak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-261-9</identifier>
</relatedItem>
<abstract>Human evaluation is indispensable in natural language processing (NLP), as automatic metrics are known to not always align well with human judgments. However, the reproducibility of human evaluations can be problematic since results are susceptible to many factors, the details of which are often missing from the respective works. As part of the ReproHum project, this work aims to reproduce the human evaluation of a single criterion in the paper “MemSum: Extractive Summarization of Long Documents Using Multi-Step Episodic Markov Decision Processes” (Gu et al., 2022). The results of our reproduction differ noticeably from those of the original study. To explain this discrepancy, we discuss differences in the experimental setup, as well as more general characteristics of the selected domain and the generated summaries.</abstract>
<identifier type="citekey">junker-2025-reprohum</identifier>
<location>
<url>https://aclanthology.org/2025.gem-1.50/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>561</start>
<end>567</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ReproHum #0729-04: Human Evaluation Reproduction Report for “MemSum: Extractive Summarization of Long Documents Using Multi-Step Episodic Markov Decision Processes”
%A Junker, Simeon
%Y Arviv, Ofir
%Y Clinciu, Miruna
%Y Dhole, Kaustubh
%Y Dror, Rotem
%Y Gehrmann, Sebastian
%Y Habba, Eliya
%Y Itzhak, Itay
%Y Mille, Simon
%Y Perlitz, Yotam
%Y Santus, Enrico
%Y Sedoc, João
%Y Shmueli Scheuer, Michal
%Y Stanovsky, Gabriel
%Y Tafjord, Oyvind
%S Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria and virtual meeting
%@ 979-8-89176-261-9
%F junker-2025-reprohum
%X Human evaluation is indispensable in natural language processing (NLP), as automatic metrics are known to not always align well with human judgments. However, the reproducibility of human evaluations can be problematic since results are susceptible to many factors, the details of which are often missing from the respective works. As part of the ReproHum project, this work aims to reproduce the human evaluation of a single criterion in the paper “MemSum: Extractive Summarization of Long Documents Using Multi-Step Episodic Markov Decision Processes” (Gu et al., 2022). The results of our reproduction differ noticeably from those of the original study. To explain this discrepancy, we discuss differences in the experimental setup, as well as more general characteristics of the selected domain and the generated summaries.
%U https://aclanthology.org/2025.gem-1.50/
%P 561-567
Markdown (Informal)
[ReproHum #0729-04: Human Evaluation Reproduction Report for “MemSum: Extractive Summarization of Long Documents Using Multi-Step Episodic Markov Decision Processes”](https://aclanthology.org/2025.gem-1.50/) (Junker, GEM 2025)
ACL