% Findings of EMNLP 2025 paper. In the title, whole-word braces keep the proper
% nouns "Zipf" and "Heaps" and the acronym "LLM" capitalised under styles that
% apply sentence casing to titles.
@inproceedings{mikhaylovskiy-2025-zipfs,
  title     = {{Zipf}'s and {Heaps}' Laws for Tokens and {LLM}-generated Texts},
  author    = {Mikhaylovskiy, Nikolay},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.837/},
  doi       = {10.18653/v1/2025.findings-emnlp.837},
  pages     = {15469--15481},
  isbn      = {979-8-89176-335-7},
  abstract  = {The frequency distribution of words in human-written texts roughly follows a simple mathematical form known as Zipf{'}s law. Somewhat less well known is the related Heaps' law, which describes a sublinear power-law growth of vocabulary size with document size. We study the applicability of Zipf{'}s and Heaps' laws to texts generated by Large Language Models (LLMs). We empirically show that Heaps' and Zipf{'}s laws only hold for LLM-generated texts in a narrow model-dependent temperature range. These temperatures have an optimal value close to $t=1$ for all the base models except the large Llama models, are higher for instruction-finetuned models and do not depend on the model size or prompting. This independently confirms the recent discovery of sampling temperature dependent phase transitions in LLM-generated texts.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; same citekey as the BibTeX entry. -->
  <mods ID="mikhaylovskiy-2025-zipfs">
    <titleInfo>
      <title>Zipf’s and Heaps’ Laws for Tokens and LLM-generated Texts</title>
    </titleInfo>
    <!-- Paper author -->
    <name type="personal">
      <namePart type="given">Nikolay</namePart>
      <namePart type="family">Mikhaylovskiy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <abstract>The frequency distribution of words in human-written texts roughly follows a simple mathematical form known as Zipf’s law. Somewhat less well known is the related Heaps’ law, which describes a sublinear power-law growth of vocabulary size with document size. We study the applicability of Zipf’s and Heaps’ laws to texts generated by Large Language Models (LLMs). We empirically show that Heaps’ and Zipf’s laws only hold for LLM-generated texts in a narrow model-dependent temperature range. These temperatures have an optimal value close to t=1 for all the base models except the large Llama models, are higher for instruction-finetuned models and do not depend on the model size or prompting. This independently confirms the recent discovery of sampling temperature dependent phase transitions in LLM-generated texts.</abstract>
    <!-- Identifiers and landing page -->
    <identifier type="citekey">mikhaylovskiy-2025-zipfs</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-emnlp.837</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.837/</url>
    </location>
    <!-- Page range of the paper within the host volume -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>15469</start>
        <end>15481</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Zipf’s and Heaps’ Laws for Tokens and LLM-generated Texts
%A Mikhaylovskiy, Nikolay
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F mikhaylovskiy-2025-zipfs
%X The frequency distribution of words in human-written texts roughly follows a simple mathematical form known as Zipf’s law. Somewhat less well known is the related Heaps’ law, which describes a sublinear power-law growth of vocabulary size with document size. We study the applicability of Zipf’s and Heaps’ laws to texts generated by Large Language Models (LLMs). We empirically show that Heaps’ and Zipf’s laws only hold for LLM-generated texts in a narrow model-dependent temperature range. These temperatures have an optimal value close to t=1 for all the base models except the large Llama models, are higher for instruction-finetuned models and do not depend on the model size or prompting. This independently confirms the recent discovery of sampling temperature dependent phase transitions in LLM-generated texts.
%R 10.18653/v1/2025.findings-emnlp.837
%U https://aclanthology.org/2025.findings-emnlp.837/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.837
%P 15469-15481
Markdown (Informal)
[Zipf’s and Heaps’ Laws for Tokens and LLM-generated Texts](https://aclanthology.org/2025.findings-emnlp.837/) (Mikhaylovskiy, Findings 2025)
ACL