@inproceedings{mahamood-2026-reprohum,
title = "{R}epro{H}um {\#}0669-08: Reproducing a Recipe for Arbitrary Text Style Transfer with {LLM}s",
author = "Mahamood, Saad",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.90/",
pages = "1127--1132",
ISBN = "979-8-89176-423-1",
abstract = "We describe our attempt to reproduce a single human evaluation quality criterion that was conducted in the paper ``Reproducing a Recipe for Arbitrary Text Style Transfer with LLMs''. This paper describes the approach and challenges involved in reproducing the human evaluation as done by the original authors. In particular, we describe negative results obtained during the reproduction, and we compare our results with an earlier reproduction for the same experiment. Finally, we describe the insights we gained from attempting this particular reproduction and the barriers that remain in attempting successful reproductions. The results and insights presented will hopefully enable the broader NLP research community to improve both how human evaluations are conducted and enable better reproducibility of NLP experiments in the future."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mahamood-2026-reprohum">
<titleInfo>
<title>ReproHum #0669-08: Reproducing a Recipe for Arbitrary Text Style Transfer with LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saad</namePart>
<namePart type="family">Mahamood</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>We describe our attempt to reproduce a single human evaluation quality criterion that was conducted in the paper “Reproducing a Recipe for Arbitrary Text Style Transfer with LLMs”. This paper describes the approach and challenges involved in reproducing the human evaluation as done by the original authors. In particular, we describe negative results obtained during the reproduction, and we compare our results with an earlier reproduction for the same experiment. Finally, we describe the insights we gained from attempting this particular reproduction and the barriers that remain in attempting successful reproductions. The results and insights presented will hopefully enable the broader NLP research community to improve both how human evaluations are conducted and enable better reproducibility of NLP experiments in the future.</abstract>
<identifier type="citekey">mahamood-2026-reprohum</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.90/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1127</start>
<end>1132</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ReproHum #0669-08: Reproducing a Recipe for Arbitrary Text Style Transfer with LLMs
%A Mahamood, Saad
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F mahamood-2026-reprohum
%X We describe our attempt to reproduce a single human evaluation quality criterion that was conducted in the paper “Reproducing a Recipe for Arbitrary Text Style Transfer with LLMs”. This paper describes the approach and challenges involved in reproducing the human evaluation as done by the original authors. In particular, we describe negative results obtained during the reproduction, and we compare our results with an earlier reproduction for the same experiment. Finally, we describe the insights we gained from attempting this particular reproduction and the barriers that remain in attempting successful reproductions. The results and insights presented will hopefully enable the broader NLP research community to improve both how human evaluations are conducted and enable better reproducibility of NLP experiments in the future.
%U https://aclanthology.org/2026.gem-main.90/
%P 1127-1132
Markdown (Informal)
[ReproHum #0669-08: Reproducing a Recipe for Arbitrary Text Style Transfer with LLMs](https://aclanthology.org/2026.gem-main.90/) (Mahamood, GEM 2026)
ACL