@inproceedings{pimentel-meister-2024-compute,
title = "How to Compute the Probability of a Word",
author = "Pimentel, Tiago and
Meister, Clara",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1020/",
doi = "10.18653/v1/2024.emnlp-main.1020",
pages = "18358--18375",
abstract = "Language models (LMs) estimate a probability distribution over strings in a natural language; these distributions are crucial for computing perplexity and surprisal in linguistics research. While we are usually concerned with measuring these values for words, most LMs operate over subwords. Despite seemingly straightforward, accurately computing probabilities over one unit given probabilities over the other requires care. Indeed, we show here that many recent linguistic studies have been incorrectly computing these values. This paper derives the correct methods for computing word probabilities, highlighting issues when relying on language models that use beginning-of-word (bow)-marking tokenisers, e.g., the GPT family. Empirically, we show that correcting the widespread bug in probability computations affects measured outcomes in sentence comprehension and lexical optimisation analyses."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pimentel-meister-2024-compute">
<titleInfo>
<title>How to Compute the Probability of a Word</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tiago</namePart>
<namePart type="family">Pimentel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clara</namePart>
<namePart type="family">Meister</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Language models (LMs) estimate a probability distribution over strings in a natural language; these distributions are crucial for computing perplexity and surprisal in linguistics research. While we are usually concerned with measuring these values for words, most LMs operate over subwords. Though seemingly straightforward, accurately computing probabilities over one unit given probabilities over the other requires care. Indeed, we show here that many recent linguistic studies have been incorrectly computing these values. This paper derives the correct methods for computing word probabilities, highlighting issues when relying on language models that use beginning-of-word (bow)-marking tokenisers, e.g., the GPT family. Empirically, we show that correcting the widespread bug in probability computations affects measured outcomes in sentence comprehension and lexical optimisation analyses.</abstract>
<identifier type="citekey">pimentel-meister-2024-compute</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1020</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1020/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>18358</start>
<end>18375</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How to Compute the Probability of a Word
%A Pimentel, Tiago
%A Meister, Clara
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F pimentel-meister-2024-compute
%X Language models (LMs) estimate a probability distribution over strings in a natural language; these distributions are crucial for computing perplexity and surprisal in linguistics research. While we are usually concerned with measuring these values for words, most LMs operate over subwords. Though seemingly straightforward, accurately computing probabilities over one unit given probabilities over the other requires care. Indeed, we show here that many recent linguistic studies have been incorrectly computing these values. This paper derives the correct methods for computing word probabilities, highlighting issues when relying on language models that use beginning-of-word (bow)-marking tokenisers, e.g., the GPT family. Empirically, we show that correcting the widespread bug in probability computations affects measured outcomes in sentence comprehension and lexical optimisation analyses.
%R 10.18653/v1/2024.emnlp-main.1020
%U https://aclanthology.org/2024.emnlp-main.1020/
%U https://doi.org/10.18653/v1/2024.emnlp-main.1020
%P 18358-18375

Markdown (Informal)
[How to Compute the Probability of a Word](https://aclanthology.org/2024.emnlp-main.1020/) (Pimentel & Meister, EMNLP 2024)
ACL
- Tiago Pimentel and Clara Meister. 2024. How to Compute the Probability of a Word. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 18358–18375, Miami, Florida, USA. Association for Computational Linguistics.
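
The abstract above describes a widespread bug in how word probabilities are aggregated from subword probabilities. As a minimal, hypothetical sketch of what that naive aggregation looks like (not the paper's corrected method), the Python below sums each word's subword log-probabilities into a word surprisal, using a GPT-style beginning-of-word marker to detect word boundaries; the function name, the marker convention, and the toy token scores are all invented for illustration. Per the paper, this plain sum is only exact for end-of-word-marking tokenisers, and bow-marking tokenisers (e.g., the GPT family) require an additional boundary correction derived there.

```python
import math
from typing import List, Tuple

# Hypothetical helper: aggregate subword log-probabilities into word surprisals.
# Assumes a bow-marking tokeniser (GPT-style), where a leading space on a token
# signals that it starts a new word. The (token, log-prob) pairs would come from
# whatever LM is being scored; nothing here calls a specific library.
def naive_word_surprisals(
    scored_tokens: List[Tuple[str, float]],  # (token_text, log p(token | prefix))
    bow_marker: str = " ",
) -> List[Tuple[str, float]]:
    """Sum each word's subword log-probs and negate to get surprisal (in nats).

    NOTE: per Pimentel & Meister (2024), this naive sum is only exact for
    end-of-word-marking tokenisers; bow-marking tokenisers need an extra
    correction at word boundaries, because the signal that a word has ended
    is carried by the *next* token's bow marker.
    """
    words: List[Tuple[str, float]] = []
    current_word, current_logp = "", 0.0
    for token, logp in scored_tokens:
        starts_new_word = token.startswith(bow_marker)
        if starts_new_word and current_word:
            words.append((current_word, -current_logp))  # surprisal = -log p
            current_word, current_logp = "", 0.0
        current_word += token.lstrip(bow_marker) if starts_new_word else token
        current_logp += logp
    if current_word:
        words.append((current_word, -current_logp))
    return words


if __name__ == "__main__":
    # Toy token scores (made up): "probability" split as " probab" + "ility".
    toy = [(" The", -2.1), (" probab", -6.3), ("ility", -0.2), (" of", -1.0)]
    for word, surprisal in naive_word_surprisals(toy):
        print(f"{word}\t{surprisal:.2f} nats ({surprisal / math.log(2):.2f} bits)")
```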