@inproceedings{michaelov-etal-2025-quite,
title = "Not quite Sherlock Holmes: Language model predictions do not reliably differentiate impossible from improbable events",
author = "Michaelov, James A. and
Estacio, Reeka and
Zhang, Zhien and
Bergen, Ben",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.696/",
doi = "10.18653/v1/2025.findings-acl.696",
pages = "13528--13551",
ISBN = "979-8-89176-256-5",
abstract = "Can language models reliably predict that possible events are more likely than impossible events? By teasing apart possibility, typicality, and contextual relatedness, we show that despite the results of previous work, language models' ability to do this is far from robust. In fact, under certain conditions, all models tested{---}including Llama 3, Gemma 2, and Mistral NeMo{---}perform at worse-than-chance level, assigning higher probabilities to impossible sentences such as `the car was given a parking ticket by the brake' than to merely unlikely sentences such as `the car was given a parking ticket by the explorer'."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="michaelov-etal-2025-quite">
<titleInfo>
<title>Not quite Sherlock Holmes: Language model predictions do not reliably differentiate impossible from improbable events</title>
</titleInfo>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Michaelov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reeka</namePart>
<namePart type="family">Estacio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhien</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ben</namePart>
<namePart type="family">Bergen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Can language models reliably predict that possible events are more likely than impossible events? By teasing apart possibility, typicality, and contextual relatedness, we show that despite the results of previous work, language models’ ability to do this is far from robust. In fact, under certain conditions, all models tested—including Llama 3, Gemma 2, and Mistral NeMo—perform at worse-than-chance level, assigning higher probabilities to impossible sentences such as ‘the car was given a parking ticket by the brake’ than to merely unlikely sentences such as ‘the car was given a parking ticket by the explorer’.</abstract>
<identifier type="citekey">michaelov-etal-2025-quite</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.696</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.696/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>13528</start>
<end>13551</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Not quite Sherlock Holmes: Language model predictions do not reliably differentiate impossible from improbable events
%A Michaelov, James A.
%A Estacio, Reeka
%A Zhang, Zhien
%A Bergen, Ben
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F michaelov-etal-2025-quite
%X Can language models reliably predict that possible events are more likely than impossible events? By teasing apart possibility, typicality, and contextual relatedness, we show that despite the results of previous work, language models’ ability to do this is far from robust. In fact, under certain conditions, all models tested—including Llama 3, Gemma 2, and Mistral NeMo—perform at worse-than-chance level, assigning higher probabilities to impossible sentences such as ‘the car was given a parking ticket by the brake’ than to merely unlikely sentences such as ‘the car was given a parking ticket by the explorer’.
%R 10.18653/v1/2025.findings-acl.696
%U https://aclanthology.org/2025.findings-acl.696/
%U https://doi.org/10.18653/v1/2025.findings-acl.696
%P 13528-13551
Markdown (Informal)
[Not quite Sherlock Holmes: Language model predictions do not reliably differentiate impossible from improbable events](https://aclanthology.org/2025.findings-acl.696/) (Michaelov et al., Findings 2025)
ACL