@inproceedings{li-etal-2024-reprohum,
title = "{R}epro{H}um {\#}0033-3: Comparable Relative Results with Lower Absolute Values in a Reproduction Study",
author = "Li, Yiru and
Lai, Huiyuan and
Toral, Antonio and
Nissim, Malvina",
editor = "Balloccu, Simone and
Belz, Anya and
Huidrom, Rudali and
Reiter, Ehud and
Sedoc, Joao and
Thomson, Craig",
booktitle = "Proceedings of the Fourth Workshop on Human Evaluation of NLP Systems (HumEval) @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.humeval-1.21",
pages = "238--249",
abstract = "In the context of the ReproHum project aimed at assessing the reliability of human evaluation, we replicated the human evaluation conducted in {``}Generating Scientific Definitions with Controllable Complexity{''} by August et al. (2022). Specifically, humans were asked to assess the fluency of automatically generated scientific definitions by three different models, with output complexity varying according to target audience. Evaluation conditions were kept as close as possible to the original study, except of necessary and minor adjustments. Our results, despite yielding lower absolute performance, show that relative performance across the three tested systems remains comparable to what was observed in the original paper. On the basis of lower inter-annotator agreement and feedback received from annotators in our experiment, we also observe that the ambiguity of the concept being evaluated may play a substantial role in human assessment.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2024-reprohum">
<titleInfo>
<title>ReproHum #0033-3: Comparable Relative Results with Lower Absolute Values in a Reproduction Study</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiru</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huiyuan</namePart>
<namePart type="family">Lai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="family">Toral</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malvina</namePart>
<namePart type="family">Nissim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Human Evaluation of NLP Systems (HumEval) @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simone</namePart>
<namePart type="family">Balloccu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rudali</namePart>
<namePart type="family">Huidrom</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joao</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Craig</namePart>
<namePart type="family">Thomson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In the context of the ReproHum project aimed at assessing the reliability of human evaluation, we replicated the human evaluation conducted in “Generating Scientific Definitions with Controllable Complexity” by August et al. (2022). Specifically, humans were asked to assess the fluency of automatically generated scientific definitions by three different models, with output complexity varying according to target audience. Evaluation conditions were kept as close as possible to the original study, except of necessary and minor adjustments. Our results, despite yielding lower absolute performance, show that relative performance across the three tested systems remains comparable to what was observed in the original paper. On the basis of lower inter-annotator agreement and feedback received from annotators in our experiment, we also observe that the ambiguity of the concept being evaluated may play a substantial role in human assessment.</abstract>
<identifier type="citekey">li-etal-2024-reprohum</identifier>
<location>
<url>https://aclanthology.org/2024.humeval-1.21</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>238</start>
<end>249</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ReproHum #0033-3: Comparable Relative Results with Lower Absolute Values in a Reproduction Study
%A Li, Yiru
%A Lai, Huiyuan
%A Toral, Antonio
%A Nissim, Malvina
%Y Balloccu, Simone
%Y Belz, Anya
%Y Huidrom, Rudali
%Y Reiter, Ehud
%Y Sedoc, Joao
%Y Thomson, Craig
%S Proceedings of the Fourth Workshop on Human Evaluation of NLP Systems (HumEval) @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F li-etal-2024-reprohum
%X In the context of the ReproHum project aimed at assessing the reliability of human evaluation, we replicated the human evaluation conducted in “Generating Scientific Definitions with Controllable Complexity” by August et al. (2022). Specifically, humans were asked to assess the fluency of automatically generated scientific definitions by three different models, with output complexity varying according to target audience. Evaluation conditions were kept as close as possible to the original study, except of necessary and minor adjustments. Our results, despite yielding lower absolute performance, show that relative performance across the three tested systems remains comparable to what was observed in the original paper. On the basis of lower inter-annotator agreement and feedback received from annotators in our experiment, we also observe that the ambiguity of the concept being evaluated may play a substantial role in human assessment.
%U https://aclanthology.org/2024.humeval-1.21
%P 238-249
Markdown (Informal)
[ReproHum #0033-3: Comparable Relative Results with Lower Absolute Values in a Reproduction Study](https://aclanthology.org/2024.humeval-1.21) (Li et al., HumEval-WS 2024)
ACL