@inproceedings{fan-chen-2026-reprohum,
title = "{R}epro{H}um {\#}0866-04: Variability in Human Judgments of Sociopolitical Acceptability Across Studies",
author = "Fan, Rui and
Chen, Guanyi",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.87/",
pages = "1104--1110",
ISBN = "979-8-89176-423-1",
abstract = "Human evaluations are essential for assessing NLP systems, but their reproducibility can be limited when judgments involve socially sensitive constructs. This paper reproduces the perceived sociopolitical acceptability evaluation in (CITATION), where annotators judged whether model-generated writer-intent implications reflected mainstream or fringe viewpoints. Using the same 600 headline{--}belief pairs, we collected new annotations on Prolific and compared our results with both the original study and a prior reproduction. Our scores are lower than the original results. Under a 70{\%} threshold, these findings do not support the original conclusion that most generations were socially acceptable. Overall, our results align more closely with the prior reproduction, while also showing substantial variability, especially for GPT2-large. We argue that this variability may arise from a combination of platform differences, task framing, topic effects, and changes in social context over time. These findings highlight the importance of reporting not only annotation results, but also the evaluation setting in which subjective social judgments are collected."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fan-chen-2026-reprohum">
<titleInfo>
<title>ReproHum #0866-04: Variability in Human Judgments of Sociopolitical Acceptability Across Studies</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rui</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanyi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>Human evaluations are essential for assessing NLP systems, but their reproducibility can be limited when judgments involve socially sensitive constructs. This paper reproduces the perceived sociopolitical acceptability evaluation in (CITATION), where annotators judged whether model-generated writer-intent implications reflected mainstream or fringe viewpoints. Using the same 600 headline–belief pairs, we collected new annotations on Prolific and compared our results with both the original study and a prior reproduction. Our scores are lower than the original results. Under a 70% threshold, these findings do not support the original conclusion that most generations were socially acceptable. Overall, our results align more closely with the prior reproduction, while also showing substantial variability, especially for GPT2-large. We argue that this variability may arise from a combination of platform differences, task framing, topic effects, and changes in social context over time. These findings highlight the importance of reporting not only annotation results, but also the evaluation setting in which subjective social judgments are collected.</abstract>
<identifier type="citekey">fan-chen-2026-reprohum</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.87/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1104</start>
<end>1110</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ReproHum #0866-04: Variability in Human Judgments of Sociopolitical Acceptability Across Studies
%A Fan, Rui
%A Chen, Guanyi
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F fan-chen-2026-reprohum
%X Human evaluations are essential for assessing NLP systems, but their reproducibility can be limited when judgments involve socially sensitive constructs. This paper reproduces the perceived sociopolitical acceptability evaluation in (CITATION), where annotators judged whether model-generated writer-intent implications reflected mainstream or fringe viewpoints. Using the same 600 headline–belief pairs, we collected new annotations on Prolific and compared our results with both the original study and a prior reproduction. Our scores are lower than the original results. Under a 70% threshold, these findings do not support the original conclusion that most generations were socially acceptable. Overall, our results align more closely with the prior reproduction, while also showing substantial variability, especially for GPT2-large. We argue that this variability may arise from a combination of platform differences, task framing, topic effects, and changes in social context over time. These findings highlight the importance of reporting not only annotation results, but also the evaluation setting in which subjective social judgments are collected.
%U https://aclanthology.org/2026.gem-main.87/
%P 1104-1110
Markdown (Informal)
[ReproHum #0866-04: Variability in Human Judgments of Sociopolitical Acceptability Across Studies](https://aclanthology.org/2026.gem-main.87/) (Fan & Chen, GEM 2026)
ACL