@inproceedings{ghaffari-krishnaswamy-2026-now,
title = "Now They See It, Now They Don{'}t: Multimodal Reward Models Exhibit Unreliability in Physical World Constraints",
author = "Ghaffari, Sadaf and
Krishnaswamy, Nikhil",
editor = "Bonial, Claire and
Berzak, Yevgeni",
booktitle = "Proceedings of the 30th Conference on Computational Natural Language Learning",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.conll-main.20/",
pages = "344--357",
ISBN = "979-8-89176-410-1",
abstract = "Generative AI systems, especially those driven by autoregressive and diffusion-based models, are known to struggle with spatial reasoning. As such, it becomes critical to understand how humans regard those failure modes. In this paper, we examine how humans judge different types of errors in images generated by a text-to-image model. We curated prompts that described common household objects with variance in number, spatial relations, and orientations, and generated a variety of images using each prompt. Humans observed pairs of images generated using the same prompt and answered a set of systematic questions about each image. Survey results showed that incorrect spatial *orientation* regularly emerges as a reason that the generated images do not accurately represent the prompt. We further investigated how RLHF-based multimodal reward models score prompt-image alignment over the same data, and whether they can reliably distinguish the better image in a pairwise setting, as humans do. We find that even though a general cross-task reward model may output alignment scores that accord with those of humans, its reasoning traces are flawed with respect to spatial orientational and relational indicators{---}the very factors that human annotators rated as the most consequential errors in generated images. Our results show that human annotators regard spatial reasoning errors as highly impactful on the correctness of generated images, and undermine the reliability of multimodal reward model scores as a baseline for evaluating image quality."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ghaffari-krishnaswamy-2026-now">
<titleInfo>
<title>Now They See It, Now They Don’t: Multimodal Reward Models Exhibit Unreliability in Physical World Constraints</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sadaf</namePart>
<namePart type="family">Ghaffari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikhil</namePart>
<namePart type="family">Krishnaswamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 30th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yevgeni</namePart>
<namePart type="family">Berzak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-410-1</identifier>
</relatedItem>
<abstract>Generative AI systems, especially those driven by autoregressive and diffusion-based models, are known to struggle with spatial reasoning. As such, it becomes critical to understand how humans regard those failure modes. In this paper, we examine how humans judge different types of errors in images generated by a text-to-image model. We curated prompts that described common household objects with variance in number, spatial relations, and orientations, and generated a variety of images using each prompt. Humans observed pairs of images generated using the same prompt and answered a set of systematic questions about each image. Survey results showed that incorrect spatial *orientation* regularly emerges as a reason that the generated images do not accurately represent the prompt. We further investigated how RLHF-based multimodal reward models score prompt-image alignment over the same data, and whether they can reliably distinguish the better image in a pairwise setting, as humans do. We find that even though a general cross-task reward model may output alignment scores that accord with those of humans, its reasoning traces are flawed with respect to spatial orientational and relational indicators—the very factors that human annotators rated as the most consequential errors in generated images. Our results show that human annotators regard spatial reasoning errors as highly impactful on the correctness of generated images, and undermine the reliability of multimodal reward model scores as a baseline for evaluating image quality.</abstract>
<identifier type="citekey">ghaffari-krishnaswamy-2026-now</identifier>
<location>
<url>https://aclanthology.org/2026.conll-main.20/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>344</start>
<end>357</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Now They See It, Now They Don’t: Multimodal Reward Models Exhibit Unreliability in Physical World Constraints
%A Ghaffari, Sadaf
%A Krishnaswamy, Nikhil
%Y Bonial, Claire
%Y Berzak, Yevgeni
%S Proceedings of the 30th Conference on Computational Natural Language Learning
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-410-1
%F ghaffari-krishnaswamy-2026-now
%X Generative AI systems, especially those driven by autoregressive and diffusion-based models, are known to struggle with spatial reasoning. As such, it becomes critical to understand how humans regard those failure modes. In this paper, we examine how humans judge different types of errors in images generated by a text-to-image model. We curated prompts that described common household objects with variance in number, spatial relations, and orientations, and generated a variety of images using each prompt. Humans observed pairs of images generated using the same prompt and answered a set of systematic questions about each image. Survey results showed that incorrect spatial *orientation* regularly emerges as a reason that the generated images do not accurately represent the prompt. We further investigated how RLHF-based multimodal reward models score prompt-image alignment over the same data, and whether they can reliably distinguish the better image in a pairwise setting, as humans do. We find that even though a general cross-task reward model may output alignment scores that accord with those of humans, its reasoning traces are flawed with respect to spatial orientational and relational indicators—the very factors that human annotators rated as the most consequential errors in generated images. Our results show that human annotators regard spatial reasoning errors as highly impactful on the correctness of generated images, and undermine the reliability of multimodal reward model scores as a baseline for evaluating image quality.
%U https://aclanthology.org/2026.conll-main.20/
%P 344-357
Markdown (Informal)
[Now They See It, Now They Don’t: Multimodal Reward Models Exhibit Unreliability in Physical World Constraints](https://aclanthology.org/2026.conll-main.20/) (Ghaffari & Krishnaswamy, CoNLL 2026)
ACL