@inproceedings{wilcox-etal-2023-language,
title = "Language Model Quality Correlates with Psychometric Predictive Power in Multiple Languages",
author = "Wilcox, Ethan and
Meister, Clara and
Cotterell, Ryan and
Pimentel, Tiago",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.emnlp-main.466",
doi = "10.18653/v1/2023.emnlp-main.466",
pages = "7503--7511",
abstract = "Surprisal theory (Hale, 2001; Levy, 2008) posits that a word{'}s reading time is proportional to its surprisal (i.e., to its negative log probability given the proceeding context). Since we are unable to access a word{'}s ground-truth probability, surprisal theory has been empirically tested using surprisal estimates from language models (LMs). Under the premise that surprisal theory holds, we would expect that higher quality language models provide more powerful predictors of human reading behavior{---}a conjecture we dub the quality{--}power (QP) hypothesis. Unfortunately, empirical support for the QP hypothesis is mixed. Some studies in English have found correlations between LM quality and predictive power, but other studies using Japanese data, as well as using larger English LMs, find no such correlations. In this work, we conduct a systematic crosslinguistic assessment of the QP hypothesis. We train LMs from scratch on small- and medium-sized datasets from 13 languages (across five language families) and assess their ability to predict eye tracking data. We find correlations between LM quality and power in eleven of these thirteen languages, suggesting that, within the range of model classes and sizes tested, better language models are indeed better predictors of human language processing behaviors.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wilcox-etal-2023-language">
<titleInfo>
<title>Language Model Quality Correlates with Psychometric Predictive Power in Multiple Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ethan</namePart>
<namePart type="family">Wilcox</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clara</namePart>
<namePart type="family">Meister</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Cotterell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tiago</namePart>
<namePart type="family">Pimentel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Surprisal theory (Hale, 2001; Levy, 2008) posits that a word’s reading time is proportional to its surprisal (i.e., to its negative log probability given the preceding context). Since we are unable to access a word’s ground-truth probability, surprisal theory has been empirically tested using surprisal estimates from language models (LMs). Under the premise that surprisal theory holds, we would expect that higher quality language models provide more powerful predictors of human reading behavior—a conjecture we dub the quality–power (QP) hypothesis. Unfortunately, empirical support for the QP hypothesis is mixed. Some studies in English have found correlations between LM quality and predictive power, but other studies using Japanese data, as well as using larger English LMs, find no such correlations. In this work, we conduct a systematic crosslinguistic assessment of the QP hypothesis. We train LMs from scratch on small- and medium-sized datasets from 13 languages (across five language families) and assess their ability to predict eye tracking data. We find correlations between LM quality and power in eleven of these thirteen languages, suggesting that, within the range of model classes and sizes tested, better language models are indeed better predictors of human language processing behaviors.</abstract>
<identifier type="citekey">wilcox-etal-2023-language</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.466</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.466</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>7503</start>
<end>7511</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language Model Quality Correlates with Psychometric Predictive Power in Multiple Languages
%A Wilcox, Ethan
%A Meister, Clara
%A Cotterell, Ryan
%A Pimentel, Tiago
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F wilcox-etal-2023-language
%X Surprisal theory (Hale, 2001; Levy, 2008) posits that a word’s reading time is proportional to its surprisal (i.e., to its negative log probability given the preceding context). Since we are unable to access a word’s ground-truth probability, surprisal theory has been empirically tested using surprisal estimates from language models (LMs). Under the premise that surprisal theory holds, we would expect that higher quality language models provide more powerful predictors of human reading behavior—a conjecture we dub the quality–power (QP) hypothesis. Unfortunately, empirical support for the QP hypothesis is mixed. Some studies in English have found correlations between LM quality and predictive power, but other studies using Japanese data, as well as using larger English LMs, find no such correlations. In this work, we conduct a systematic crosslinguistic assessment of the QP hypothesis. We train LMs from scratch on small- and medium-sized datasets from 13 languages (across five language families) and assess their ability to predict eye tracking data. We find correlations between LM quality and power in eleven of these thirteen languages, suggesting that, within the range of model classes and sizes tested, better language models are indeed better predictors of human language processing behaviors.
%R 10.18653/v1/2023.emnlp-main.466
%U https://aclanthology.org/2023.emnlp-main.466
%U https://doi.org/10.18653/v1/2023.emnlp-main.466
%P 7503-7511
Markdown (Informal)
[Language Model Quality Correlates with Psychometric Predictive Power in Multiple Languages](https://aclanthology.org/2023.emnlp-main.466) (Wilcox et al., EMNLP 2023)
ACL
Ethan Wilcox, Clara Meister, Ryan Cotterell, and Tiago Pimentel. 2023. Language Model Quality Correlates with Psychometric Predictive Power in Multiple Languages. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 7503–7511, Singapore. Association for Computational Linguistics.