@inproceedings{hong-etal-2024-large,
title = "Do large language models and humans have similar behaviours in causal inference with script knowledge?",
author = "Hong, Xudong and
Ryzhova, Margarita and
Biondi, Daniel and
Demberg, Vera",
editor = "Bollegala, Danushka and
Shwartz, Vered",
booktitle = "Proceedings of the 13th Joint Conference on Lexical and Computational Semantics (*SEM 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.starsem-1.34",
doi = "10.18653/v1/2024.starsem-1.34",
pages = "421--437",
abstract = "Recently, large pre-trained language models (LLMs) have demonstrated superior language understanding abilities, including zero-shot causal reasoning. However, it is unclear to what extent their capabilities are similar to human ones. We here study the processing of an event $B$ in a script-based story, which causally depends on a previous event $A$. In our manipulation, event $A$ is stated, negated, or omitted in an earlier section of the text. We first conducted a self-paced reading experiment, which showed that humans exhibit significantly longer reading times when causal conflicts exist ($\neg A \rightarrow B$) than under logical conditions ($A \rightarrow B$). However, reading times remain similar when cause A is not explicitly mentioned, indicating that humans can easily infer event B from their script knowledge. We then tested a variety of LLMs on the same data to check to what extent the models replicate human behavior. Our experiments show that 1) only recent LLMs, like GPT-3 or Vicuna, correlate with human behavior in the $\neg A \rightarrow B$ condition. 2) Despite this correlation, all models still fail to predict that $nil \rightarrow B$ is less surprising than $\neg A \rightarrow B$, indicating that LLMs still have difficulties integrating script knowledge.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hong-etal-2024-large">
<titleInfo>
<title>Do large language models and humans have similar behaviours in causal inference with script knowledge?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xudong</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Margarita</namePart>
<namePart type="family">Ryzhova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Biondi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th Joint Conference on Lexical and Computational Semantics (*SEM 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Danushka</namePart>
<namePart type="family">Bollegala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vered</namePart>
<namePart type="family">Shwartz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, large pre-trained language models (LLMs) have demonstrated superior language understanding abilities, including zero-shot causal reasoning. However, it is unclear to what extent their capabilities are similar to human ones. We here study the processing of an event B in a script-based story, which causally depends on a previous event A. In our manipulation, event A is stated, negated, or omitted in an earlier section of the text. We first conducted a self-paced reading experiment, which showed that humans exhibit significantly longer reading times when causal conflicts exist (¬A → B) than under logical conditions (A → B). However, reading times remain similar when cause A is not explicitly mentioned, indicating that humans can easily infer event B from their script knowledge. We then tested a variety of LLMs on the same data to check to what extent the models replicate human behavior. Our experiments show that 1) only recent LLMs, like GPT-3 or Vicuna, correlate with human behavior in the ¬A → B condition. 2) Despite this correlation, all models still fail to predict that nil → B is less surprising than ¬A → B, indicating that LLMs still have difficulties integrating script knowledge.</abstract>
<identifier type="citekey">hong-etal-2024-large</identifier>
<identifier type="doi">10.18653/v1/2024.starsem-1.34</identifier>
<location>
<url>https://aclanthology.org/2024.starsem-1.34</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>421</start>
<end>437</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do large language models and humans have similar behaviours in causal inference with script knowledge?
%A Hong, Xudong
%A Ryzhova, Margarita
%A Biondi, Daniel
%A Demberg, Vera
%Y Bollegala, Danushka
%Y Shwartz, Vered
%S Proceedings of the 13th Joint Conference on Lexical and Computational Semantics (*SEM 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F hong-etal-2024-large
%X Recently, large pre-trained language models (LLMs) have demonstrated superior language understanding abilities, including zero-shot causal reasoning. However, it is unclear to what extent their capabilities are similar to human ones. We here study the processing of an event B in a script-based story, which causally depends on a previous event A. In our manipulation, event A is stated, negated, or omitted in an earlier section of the text. We first conducted a self-paced reading experiment, which showed that humans exhibit significantly longer reading times when causal conflicts exist (¬A → B) than under logical conditions (A → B). However, reading times remain similar when cause A is not explicitly mentioned, indicating that humans can easily infer event B from their script knowledge. We then tested a variety of LLMs on the same data to check to what extent the models replicate human behavior. Our experiments show that 1) only recent LLMs, like GPT-3 or Vicuna, correlate with human behavior in the ¬A → B condition. 2) Despite this correlation, all models still fail to predict that nil → B is less surprising than ¬A → B, indicating that LLMs still have difficulties integrating script knowledge.
%R 10.18653/v1/2024.starsem-1.34
%U https://aclanthology.org/2024.starsem-1.34
%U https://doi.org/10.18653/v1/2024.starsem-1.34
%P 421-437
Markdown (Informal)
[Do large language models and humans have similar behaviours in causal inference with script knowledge?](https://aclanthology.org/2024.starsem-1.34) (Hong et al., *SEM 2024)
ACL
Xudong Hong, Margarita Ryzhova, Daniel Biondi, and Vera Demberg. 2024. Do large language models and humans have similar behaviours in causal inference with script knowledge?. In Proceedings of the 13th Joint Conference on Lexical and Computational Semantics (*SEM 2024), pages 421–437, Mexico City, Mexico. Association for Computational Linguistics.