@inproceedings{liu-etal-2024-temperature,
title = "Temperature-scaling surprisal estimates improve fit to human reading times {--} but does it do so for the {\textquotedblleft}right reasons{\textquotedblright}?",
author = "Liu, Tong and
{\v{S}}krjanec, Iza and
Demberg, Vera",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.519/",
doi = "10.18653/v1/2024.acl-long.519",
pages = "9598--9619",
abstract = "A wide body of evidence shows that human language processing difficulty is predicted by the information-theoretic measure surprisal, a word`s negative log probability in context. However, it is still unclear how to best estimate these probabilities needed for predicting human processing difficulty {--} while a long-standing belief held that models with lower perplexity would provide more accurate estimates of word predictability, and therefore lead to better reading time predictions, recent work has shown that for very large models, psycholinguistic predictive power decreases. One reason could be that language models might be more confident of their predictions than humans, because they have had exposure to several magnitudes more data. In this paper, we test what effect temperature-scaling of large language model (LLM) predictions has on surprisal estimates and their predictive power of reading times of English texts. Firstly, we show that calibration of large language models typically improves with model size, i.e. poorer calibration cannot account for poorer fit to reading times. Secondly, we find that temperature-scaling probabilities lead to a systematically better fit to reading times (up to 89{\%} improvement in delta log likelihood), across several reading time corpora. Finally, we show that this improvement in fit is chiefly driven by words that are composed of multiple subword tokens."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2024-temperature">
<titleInfo>
<title>Temperature-scaling surprisal estimates improve fit to human reading times – but does it do so for the “right reasons”?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iza</namePart>
<namePart type="family">Škrjanec</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A wide body of evidence shows that human language processing difficulty is predicted by the information-theoretic measure surprisal, a word‘s negative log probability in context. However, it is still unclear how to best estimate these probabilities needed for predicting human processing difficulty – while a long-standing belief held that models with lower perplexity would provide more accurate estimates of word predictability, and therefore lead to better reading time predictions, recent work has shown that for very large models, psycholinguistic predictive power decreases. One reason could be that language models might be more confident of their predictions than humans, because they have had exposure to several magnitudes more data. In this paper, we test what effect temperature-scaling of large language model (LLM) predictions has on surprisal estimates and their predictive power of reading times of English texts. Firstly, we show that calibration of large language models typically improves with model size, i.e. poorer calibration cannot account for poorer fit to reading times. Secondly, we find that temperature-scaling probabilities lead to a systematically better fit to reading times (up to 89% improvement in delta log likelihood), across several reading time corpora. Finally, we show that this improvement in fit is chiefly driven by words that are composed of multiple subword tokens.</abstract>
<identifier type="citekey">liu-etal-2024-temperature</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.519</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.519/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>9598</start>
<end>9619</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Temperature-scaling surprisal estimates improve fit to human reading times – but does it do so for the “right reasons”?
%A Liu, Tong
%A Škrjanec, Iza
%A Demberg, Vera
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F liu-etal-2024-temperature
%X A wide body of evidence shows that human language processing difficulty is predicted by the information-theoretic measure surprisal, a word‘s negative log probability in context. However, it is still unclear how to best estimate these probabilities needed for predicting human processing difficulty – while a long-standing belief held that models with lower perplexity would provide more accurate estimates of word predictability, and therefore lead to better reading time predictions, recent work has shown that for very large models, psycholinguistic predictive power decreases. One reason could be that language models might be more confident of their predictions than humans, because they have had exposure to several magnitudes more data. In this paper, we test what effect temperature-scaling of large language model (LLM) predictions has on surprisal estimates and their predictive power of reading times of English texts. Firstly, we show that calibration of large language models typically improves with model size, i.e. poorer calibration cannot account for poorer fit to reading times. Secondly, we find that temperature-scaling probabilities lead to a systematically better fit to reading times (up to 89% improvement in delta log likelihood), across several reading time corpora. Finally, we show that this improvement in fit is chiefly driven by words that are composed of multiple subword tokens.
%R 10.18653/v1/2024.acl-long.519
%U https://aclanthology.org/2024.luhme-long.519/
%U https://doi.org/10.18653/v1/2024.acl-long.519
%P 9598-9619
Markdown (Informal)
[Temperature-scaling surprisal estimates improve fit to human reading times – but does it do so for the “right reasons”?](https://aclanthology.org/2024.luhme-long.519/) (Liu et al., ACL 2024)
ACL