@inproceedings{arvan-parde-2025-reprohum,
title = "{R}epro{H}um: {\#}0744-02: Investigating the Reproducibility of Semantic Preservation Human Evaluations",
author = "Arvan, Mohammad and
Parde, Natalie",
editor = "Arviv, Ofir and
Clinciu, Miruna and
Dhole, Kaustubh and
Dror, Rotem and
Gehrmann, Sebastian and
Habba, Eliya and
Itzhak, Itay and
Mille, Simon and
Perlitz, Yotam and
Santus, Enrico and
Sedoc, Jo{\~a}o and
Shmueli Scheuer, Michal and
Stanovsky, Gabriel and
Tafjord, Oyvind",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.gem-1.54/",
pages = "590--600",
ISBN = "979-8-89176-261-9",
abstract = "Reproducibility remains a fundamental challenge for human evaluation in Natural Language Processing (NLP), particularly due to the inherent subjectivity and variability of human judgments. This paper presents a reproduction study of the human evaluation protocol introduced by Hosking and Lapata (2021), which assesses semantic preservation in paraphrase generation models. By faithfully reproducing the original experiment with careful adaptation and applying the Quantified Reproducibility Assessment framework (Belz and Thomson, 2024a; Belz, 2022), we demonstrate strong agreement with the original findings, confirming the semantic preservation ranking among four paraphrase models. Our analyses reveal moderate inter-annotator agreement and low variability in key results, underscoring a good degree of reproducibility despite practical deviations in participant recruitment and platform. These findings highlight the feasibility and challenges of reproducing human evaluation studies in NLP. We discuss implications for improving methodological rigor, transparent reporting, and standardized protocols to bolster reproducibility in future human evaluations. The data and analysis scripts are publicly available to support ongoing community efforts toward reproducible evaluation in NLP and beyond."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arvan-parde-2025-reprohum">
<titleInfo>
<title>ReproHum: #0744-02: Investigating the Reproducibility of Semantic Preservation Human Evaluations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Arvan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Parde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Arviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliya</namePart>
<namePart type="family">Habba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itay</namePart>
<namePart type="family">Itzhak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-261-9</identifier>
</relatedItem>
<abstract>Reproducibility remains a fundamental challenge for human evaluation in Natural Language Processing (NLP), particularly due to the inherent subjectivity and variability of human judgments. This paper presents a reproduction study of the human evaluation protocol introduced by Hosking and Lapata (2021), which assesses semantic preservation in paraphrase generation models. By faithfully reproducing the original experiment with careful adaptation and applying the Quantified Reproducibility Assessment framework (Belz and Thomson, 2024a; Belz, 2022), we demonstrate strong agreement with the original findings, confirming the semantic preservation ranking among four paraphrase models. Our analyses reveal moderate inter-annotator agreement and low variability in key results, underscoring a good degree of reproducibility despite practical deviations in participant recruitment and platform. These findings highlight the feasibility and challenges of reproducing human evaluation studies in NLP. We discuss implications for improving methodological rigor, transparent reporting, and standardized protocols to bolster reproducibility in future human evaluations. The data and analysis scripts are publicly available to support ongoing community efforts toward reproducible evaluation in NLP and beyond.</abstract>
<identifier type="citekey">arvan-parde-2025-reprohum</identifier>
<location>
<url>https://aclanthology.org/2025.gem-1.54/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>590</start>
<end>600</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ReproHum: #0744-02: Investigating the Reproducibility of Semantic Preservation Human Evaluations
%A Arvan, Mohammad
%A Parde, Natalie
%Y Arviv, Ofir
%Y Clinciu, Miruna
%Y Dhole, Kaustubh
%Y Dror, Rotem
%Y Gehrmann, Sebastian
%Y Habba, Eliya
%Y Itzhak, Itay
%Y Mille, Simon
%Y Perlitz, Yotam
%Y Santus, Enrico
%Y Sedoc, João
%Y Shmueli Scheuer, Michal
%Y Stanovsky, Gabriel
%Y Tafjord, Oyvind
%S Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria and virtual meeting
%@ 979-8-89176-261-9
%F arvan-parde-2025-reprohum
%X Reproducibility remains a fundamental challenge for human evaluation in Natural Language Processing (NLP), particularly due to the inherent subjectivity and variability of human judgments. This paper presents a reproduction study of the human evaluation protocol introduced by Hosking and Lapata (2021), which assesses semantic preservation in paraphrase generation models. By faithfully reproducing the original experiment with careful adaptation and applying the Quantified Reproducibility Assessment framework (Belz and Thomson, 2024a; Belz, 2022), we demonstrate strong agreement with the original findings, confirming the semantic preservation ranking among four paraphrase models. Our analyses reveal moderate inter-annotator agreement and low variability in key results, underscoring a good degree of reproducibility despite practical deviations in participant recruitment and platform. These findings highlight the feasibility and challenges of reproducing human evaluation studies in NLP. We discuss implications for improving methodological rigor, transparent reporting, and standardized protocols to bolster reproducibility in future human evaluations. The data and analysis scripts are publicly available to support ongoing community efforts toward reproducible evaluation in NLP and beyond.
%U https://aclanthology.org/2025.gem-1.54/
%P 590-600
Markdown (Informal)
[ReproHum: #0744-02: Investigating the Reproducibility of Semantic Preservation Human Evaluations](https://aclanthology.org/2025.gem-1.54/) (Arvan & Parde, GEM 2025)
ACL