@inproceedings{moessner-adel-2025-human,
title = "Human vs. {AI}: A Novel Benchmark and a Comparative Study on the Detection of Generated Images and the Impact of Prompts",
author = "Moe{\ss}ner, Philipp and
Adel, Heike",
editor = "Alam, Firoj and
Nakov, Preslav and
Habash, Nizar and
Gurevych, Iryna and
Chowdhury, Shammur and
Shelmanov, Artem and
Wang, Yuxia and
Artemova, Ekaterina and
Kutlu, Mucahid and
Mikros, George",
booktitle = "Proceedings of the 1stWorkshop on GenAI Content Detection (GenAIDetect)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "International Conference on Computational Linguistics",
url = "https://aclanthology.org/2025.genaidetect-1.2/",
pages = "47--58",
abstract = "With the advent of publicly available AI-based text-to-image systems, the process of creating photorealistic but fully synthetic images has been largely democratized. This can pose a threat to the public through a simplified spread of disinformation. Machine detectors and human media expertise can help to differentiate between AI-generated (fake) and real images and counteract this danger. Although AI generation models are highly prompt-dependent, the impact of the prompt on the fake detection performance has rarely been investigated yet. This work therefore examines the influence of the prompt`s level of detail on the detectability of fake images, both with an AI detector and in a user study. For this purpose, we create a novel dataset, COCOXGEN, which consists of real photos from the COCO dataset as well as images generated with SDXL and Fooocus using prompts of two standardized lengths. Our user study with 200 participants shows that images generated with longer, more detailed prompts are detected significantly more easily than those generated with short prompts. Similarly, an AI-based detection model achieves better performance on images generated with longer prompts. However, humans and AI models seem to pay attention to different details, as we show in a heat map analysis."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="moessner-adel-2025-human">
<titleInfo>
<title>Human vs. AI: A Novel Benchmark and a Comparative Study on the Detection of Generated Images and the Impact of Prompts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Moeßner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heike</namePart>
<namePart type="family">Adel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1stWorkshop on GenAI Content Detection (GenAIDetect)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Firoj</namePart>
<namePart type="family">Alam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iryna</namePart>
<namePart type="family">Gurevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shammur</namePart>
<namePart type="family">Chowdhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Artem</namePart>
<namePart type="family">Shelmanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuxia</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Artemova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mucahid</namePart>
<namePart type="family">Kutlu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="family">Mikros</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Conference on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the advent of publicly available AI-based text-to-image systems, the process of creating photorealistic but fully synthetic images has been largely democratized. This can pose a threat to the public through a simplified spread of disinformation. Machine detectors and human media expertise can help to differentiate between AI-generated (fake) and real images and counteract this danger. Although AI generation models are highly prompt-dependent, the impact of the prompt on the fake detection performance has rarely been investigated yet. This work therefore examines the influence of the prompt‘s level of detail on the detectability of fake images, both with an AI detector and in a user study. For this purpose, we create a novel dataset, COCOXGEN, which consists of real photos from the COCO dataset as well as images generated with SDXL and Fooocus using prompts of two standardized lengths. Our user study with 200 participants shows that images generated with longer, more detailed prompts are detected significantly more easily than those generated with short prompts. Similarly, an AI-based detection model achieves better performance on images generated with longer prompts. However, humans and AI models seem to pay attention to different details, as we show in a heat map analysis.</abstract>
<identifier type="citekey">moessner-adel-2025-human</identifier>
<location>
<url>https://aclanthology.org/2025.genaidetect-1.2/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>47</start>
<end>58</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Human vs. AI: A Novel Benchmark and a Comparative Study on the Detection of Generated Images and the Impact of Prompts
%A Moeßner, Philipp
%A Adel, Heike
%Y Alam, Firoj
%Y Nakov, Preslav
%Y Habash, Nizar
%Y Gurevych, Iryna
%Y Chowdhury, Shammur
%Y Shelmanov, Artem
%Y Wang, Yuxia
%Y Artemova, Ekaterina
%Y Kutlu, Mucahid
%Y Mikros, George
%S Proceedings of the 1stWorkshop on GenAI Content Detection (GenAIDetect)
%D 2025
%8 January
%I International Conference on Computational Linguistics
%C Abu Dhabi, UAE
%F moessner-adel-2025-human
%X With the advent of publicly available AI-based text-to-image systems, the process of creating photorealistic but fully synthetic images has been largely democratized. This can pose a threat to the public through a simplified spread of disinformation. Machine detectors and human media expertise can help to differentiate between AI-generated (fake) and real images and counteract this danger. Although AI generation models are highly prompt-dependent, the impact of the prompt on the fake detection performance has rarely been investigated yet. This work therefore examines the influence of the prompt‘s level of detail on the detectability of fake images, both with an AI detector and in a user study. For this purpose, we create a novel dataset, COCOXGEN, which consists of real photos from the COCO dataset as well as images generated with SDXL and Fooocus using prompts of two standardized lengths. Our user study with 200 participants shows that images generated with longer, more detailed prompts are detected significantly more easily than those generated with short prompts. Similarly, an AI-based detection model achieves better performance on images generated with longer prompts. However, humans and AI models seem to pay attention to different details, as we show in a heat map analysis.
%U https://aclanthology.org/2025.genaidetect-1.2/
%P 47-58
Markdown (Informal)
[Human vs. AI: A Novel Benchmark and a Comparative Study on the Detection of Generated Images and the Impact of Prompts](https://aclanthology.org/2025.genaidetect-1.2/) (Moeßner & Adel, GenAIDetect 2025)
ACL