@inproceedings{schmidtova-etal-2025-eyes,
title = "Do My Eyes Deceive Me? A Survey of Human Evaluations of Hallucinations in {NLG}",
author = "Schmidtova, Patricia and
Cal{\`o}, Eduardo and
Balloccu, Simone and
Gkatzia, Dimitra and
Huidrom, Rudali and
Lango, Mateusz and
Same, Fahime and
Zouhar, Vil{\'e}m and
Mahamood, Saad and
Dusek, Ondrej",
editor = "Flek, Lucie and
Narayan, Shashi and
Phương, L{\^e} Hồng and
Pei, Jiahuan",
booktitle = "Proceedings of the 18th International Natural Language Generation Conference",
month = oct,
year = "2025",
address = "Hanoi, Vietnam",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.inlg-main.4/",
pages = "60--79",
abstract = "Hallucinations are one of the most pressing challenges for large language models (LLMs). While numerous methods have been proposed to detect and mitigate them automatically, human evaluation continues to serve as the gold standard. However, these human evaluations of hallucinations show substantial variation in definitions, terminology, and evaluation practices. In this paper, we survey 64 studies involving human evaluation of hallucination published between 2019 and 2024, to investigate how hallucinations are currently defined and assessed. Our analysis reveals a lack of consistency in definitions and exposes several concerning methodological shortcomings. Crucial details, such as evaluation guidelines, user interface design, inter-annotator agreement metrics, and annotator demographics, are frequently under-reported or omitted altogether."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="schmidtova-etal-2025-eyes">
<titleInfo>
<title>Do My Eyes Deceive Me? A Survey of Human Evaluations of Hallucinations in NLG</title>
</titleInfo>
<name type="personal">
<namePart type="given">Patricia</namePart>
<namePart type="family">Schmidtova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eduardo</namePart>
<namePart type="family">Calò</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simone</namePart>
<namePart type="family">Balloccu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dimitra</namePart>
<namePart type="family">Gkatzia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rudali</namePart>
<namePart type="family">Huidrom</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mateusz</namePart>
<namePart type="family">Lango</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fahime</namePart>
<namePart type="family">Same</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vilém</namePart>
<namePart type="family">Zouhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saad</namePart>
<namePart type="family">Mahamood</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondrej</namePart>
<namePart type="family">Dusek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th International Natural Language Generation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucie</namePart>
<namePart type="family">Flek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashi</namePart>
<namePart type="family">Narayan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lê</namePart>
<namePart type="given">Hồng</namePart>
<namePart type="family">Phương</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahuan</namePart>
<namePart type="family">Pei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hanoi, Vietnam</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Hallucinations are one of the most pressing challenges for large language models (LLMs). While numerous methods have been proposed to detect and mitigate them automatically, human evaluation continues to serve as the gold standard. However, these human evaluations of hallucinations show substantial variation in definitions, terminology, and evaluation practices. In this paper, we survey 64 studies involving human evaluation of hallucination published between 2019 and 2024, to investigate how hallucinations are currently defined and assessed. Our analysis reveals a lack of consistency in definitions and exposes several concerning methodological shortcomings. Crucial details, such as evaluation guidelines, user interface design, inter-annotator agreement metrics, and annotator demographics, are frequently under-reported or omitted altogether.</abstract>
<identifier type="citekey">schmidtova-etal-2025-eyes</identifier>
<location>
<url>https://aclanthology.org/2025.inlg-main.4/</url>
</location>
<part>
<date>2025-10</date>
<extent unit="page">
<start>60</start>
<end>79</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do My Eyes Deceive Me? A Survey of Human Evaluations of Hallucinations in NLG
%A Schmidtova, Patricia
%A Calò, Eduardo
%A Balloccu, Simone
%A Gkatzia, Dimitra
%A Huidrom, Rudali
%A Lango, Mateusz
%A Same, Fahime
%A Zouhar, Vilém
%A Mahamood, Saad
%A Dusek, Ondrej
%Y Flek, Lucie
%Y Narayan, Shashi
%Y Phương, Lê Hồng
%Y Pei, Jiahuan
%S Proceedings of the 18th International Natural Language Generation Conference
%D 2025
%8 October
%I Association for Computational Linguistics
%C Hanoi, Vietnam
%F schmidtova-etal-2025-eyes
%X Hallucinations are one of the most pressing challenges for large language models (LLMs). While numerous methods have been proposed to detect and mitigate them automatically, human evaluation continues to serve as the gold standard. However, these human evaluations of hallucinations show substantial variation in definitions, terminology, and evaluation practices. In this paper, we survey 64 studies involving human evaluation of hallucination published between 2019 and 2024, to investigate how hallucinations are currently defined and assessed. Our analysis reveals a lack of consistency in definitions and exposes several concerning methodological shortcomings. Crucial details, such as evaluation guidelines, user interface design, inter-annotator agreement metrics, and annotator demographics, are frequently under-reported or omitted altogether.
%U https://aclanthology.org/2025.inlg-main.4/
%P 60-79
Markdown (Informal)
[Do My Eyes Deceive Me? A Survey of Human Evaluations of Hallucinations in NLG](https://aclanthology.org/2025.inlg-main.4/) (Schmidtova et al., INLG 2025)
ACL
- Patricia Schmidtova, Eduardo Calò, Simone Balloccu, Dimitra Gkatzia, Rudali Huidrom, Mateusz Lango, Fahime Same, Vilém Zouhar, Saad Mahamood, and Ondrej Dusek. 2025. Do My Eyes Deceive Me? A Survey of Human Evaluations of Hallucinations in NLG. In Proceedings of the 18th International Natural Language Generation Conference, pages 60–79, Hanoi, Vietnam. Association for Computational Linguistics.