@inproceedings{shaib-etal-2025-standardizing,
title = "Standardizing the Measurement of Text Diversity: A Tool and Comparative Analysis",
author = "Shaib, Chantal and
Govindarajan, Venkata S and
Barrow, Joe and
Sun, Jiuding and
Siu, Alexa and
Wallace, Byron C and
Nenkova, Ani",
editor = "Liu, Xuebo and
Purwarianti, Ayu",
booktitle = "Proceedings of The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics: System Demonstrations",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.ijcnlp-demo.5/",
pages = "36--46",
ISBN = "979-8-89176-301-2",
abstract = "The diversity across outputs generated by LLMs shapes perception of their quality and utility. High lexical diversity is often desirable, but there is no standard method to measure this property. Templated answer structures and ``canned'' responses across different documents are readily noticeable, but difficult to visualize across large corpora. This work aims to standardize measurement of text diversity. Specifically, we empirically investigate the convergent validity of existing scores across English texts, and release diversity, an open-source Python package (https://pypi.org/project/diversity/, https://github.com/cshaib/diversity) for measuring and extracting repetition in text. We also build a platform (https://ai-templates.app) based on diversity for users to interactively explore repetition in text. We find that fast compression algorithms capture information similar to what is measured by slow-to-compute $n$-gram overlap homogeneity scores. Further, a combination of measures{---}compression ratios, self-repetition of long $n$-grams, and Self-BLEU{---}are sufficient to report, as they have low mutual correlation with each other."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shaib-etal-2025-standardizing">
<titleInfo>
<title>Standardizing the Measurement of Text Diversity: A Tool and Comparative Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chantal</namePart>
<namePart type="family">Shaib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Venkata</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Govindarajan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joe</namePart>
<namePart type="family">Barrow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiuding</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexa</namePart>
<namePart type="family">Siu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Byron</namePart>
<namePart type="given">C</namePart>
<namePart type="family">Wallace</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ani</namePart>
<namePart type="family">Nenkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xuebo</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayu</namePart>
<namePart type="family">Purwarianti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-301-2</identifier>
</relatedItem>
<abstract>The diversity across outputs generated by LLMs shapes perception of their quality and utility. High lexical diversity is often desirable, but there is no standard method to measure this property. Templated answer structures and “canned” responses across different documents are readily noticeable, but difficult to visualize across large corpora. This work aims to standardize measurement of text diversity. Specifically, we empirically investigate the convergent validity of existing scores across English texts, and release diversity, an open-source Python package (https://pypi.org/project/diversity/, https://github.com/cshaib/diversity) for measuring and extracting repetition in text. We also build a platform (https://ai-templates.app) based on diversity for users to interactively explore repetition in text. We find that fast compression algorithms capture information similar to what is measured by slow-to-compute n-gram overlap homogeneity scores. Further, a combination of measures—compression ratios, self-repetition of long n-grams, and Self-BLEU—are sufficient to report, as they have low mutual correlation with each other.</abstract>
<identifier type="citekey">shaib-etal-2025-standardizing</identifier>
<location>
<url>https://aclanthology.org/2025.ijcnlp-demo.5/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>36</start>
<end>46</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Standardizing the Measurement of Text Diversity: A Tool and Comparative Analysis
%A Shaib, Chantal
%A Govindarajan, Venkata S.
%A Barrow, Joe
%A Sun, Jiuding
%A Siu, Alexa
%A Wallace, Byron C.
%A Nenkova, Ani
%Y Liu, Xuebo
%Y Purwarianti, Ayu
%S Proceedings of The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics: System Demonstrations
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-301-2
%F shaib-etal-2025-standardizing
%X The diversity across outputs generated by LLMs shapes perception of their quality and utility. High lexical diversity is often desirable, but there is no standard method to measure this property. Templated answer structures and “canned” responses across different documents are readily noticeable, but difficult to visualize across large corpora. This work aims to standardize measurement of text diversity. Specifically, we empirically investigate the convergent validity of existing scores across English texts, and release diversity, an open-source Python package (https://pypi.org/project/diversity/, https://github.com/cshaib/diversity) for measuring and extracting repetition in text. We also build a platform (https://ai-templates.app) based on diversity for users to interactively explore repetition in text. We find that fast compression algorithms capture information similar to what is measured by slow-to-compute n-gram overlap homogeneity scores. Further, a combination of measures—compression ratios, self-repetition of long n-grams, and Self-BLEU—are sufficient to report, as they have low mutual correlation with each other.
%U https://aclanthology.org/2025.ijcnlp-demo.5/
%P 36-46
Markdown (Informal)
[Standardizing the Measurement of Text Diversity: A Tool and Comparative Analysis](https://aclanthology.org/2025.ijcnlp-demo.5/) (Shaib et al., IJCNLP 2025)
ACL
- Chantal Shaib, Venkata S Govindarajan, Joe Barrow, Jiuding Sun, Alexa Siu, Byron C Wallace, and Ani Nenkova. 2025. Standardizing the Measurement of Text Diversity: A Tool and Comparative Analysis. In Proceedings of The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics: System Demonstrations, pages 36–46, Mumbai, India. Association for Computational Linguistics.