@inproceedings{fitterer-etal-2025-testing,
    title = "Testing {English} News Articles for Lexical Homogenization Due to Widespread Use of Large Language Models",
    author = "Fitterer, Sarah and
      Gangl, Dominik and
      Ulbrich, Jannes",
    editor = "Zhao, Jin and
      Wang, Mingyang and
      Liu, Zhu",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-srw.95/",
    doi = "10.18653/v1/2025.acl-srw.95",
    pages = "1239--1245",
    isbn = "979-8-89176-254-1",
    abstract = "It is widely assumed that Large Language Models (LLMs) are shaping language, with multiple studies noting the growing presence of LLM-generated content and suggesting homogenizing effects. However, it remains unclear if these effects are already evident in recent writing. This study addresses that gap by comparing two datasets of English online news articles {--} one from 2018, prior to LLM popularization, and one from 2024, after widespread LLM adoption. We define lexical homogenization as a decrease in lexical diversity, measured by the MATTR, Maas, and MTLD metrics, and introduce the LLM-Style-Word Ratio (SWR) to measure LLM influence. We found higher MTLD and SWR scores, yet negligible changes in Maas and MATTR scores in 2024 corpus. We conclude that while there is an apparent influence of LLMs on written online English, homogenization effects do not show in the measurements. We therefore propose to apply different metrics to measure lexical homogenization in future studies on the influence of LLM usage on language change."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fitterer-etal-2025-testing">
<titleInfo>
<title>Testing English News Articles for Lexical Homogenization Due to Widespread Use of Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Fitterer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dominik</namePart>
<namePart type="family">Gangl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jannes</namePart>
<namePart type="family">Ulbrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jin</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-254-1</identifier>
</relatedItem>
<abstract>It is widely assumed that Large Language Models (LLMs) are shaping language, with multiple studies noting the growing presence of LLM-generated content and suggesting homogenizing effects. However, it remains unclear if these effects are already evident in recent writing. This study addresses that gap by comparing two datasets of English online news articles – one from 2018, prior to LLM popularization, and one from 2024, after widespread LLM adoption. We define lexical homogenization as a decrease in lexical diversity, measured by the MATTR, Maas, and MTLD metrics, and introduce the LLM-Style-Word Ratio (SWR) to measure LLM influence. We found higher MTLD and SWR scores, yet negligible changes in Maas and MATTR scores in 2024 corpus. We conclude that while there is an apparent influence of LLMs on written online English, homogenization effects do not show in the measurements. We therefore propose to apply different metrics to measure lexical homogenization in future studies on the influence of LLM usage on language change.</abstract>
<identifier type="citekey">fitterer-etal-2025-testing</identifier>
<identifier type="doi">10.18653/v1/2025.acl-srw.95</identifier>
<location>
<url>https://aclanthology.org/2025.acl-srw.95/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>1239</start>
<end>1245</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Testing English News Articles for Lexical Homogenization Due to Widespread Use of Large Language Models
%A Fitterer, Sarah
%A Gangl, Dominik
%A Ulbrich, Jannes
%Y Zhao, Jin
%Y Wang, Mingyang
%Y Liu, Zhu
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-254-1
%F fitterer-etal-2025-testing
%X It is widely assumed that Large Language Models (LLMs) are shaping language, with multiple studies noting the growing presence of LLM-generated content and suggesting homogenizing effects. However, it remains unclear if these effects are already evident in recent writing. This study addresses that gap by comparing two datasets of English online news articles – one from 2018, prior to LLM popularization, and one from 2024, after widespread LLM adoption. We define lexical homogenization as a decrease in lexical diversity, measured by the MATTR, Maas, and MTLD metrics, and introduce the LLM-Style-Word Ratio (SWR) to measure LLM influence. We found higher MTLD and SWR scores, yet negligible changes in Maas and MATTR scores in 2024 corpus. We conclude that while there is an apparent influence of LLMs on written online English, homogenization effects do not show in the measurements. We therefore propose to apply different metrics to measure lexical homogenization in future studies on the influence of LLM usage on language change.
%R 10.18653/v1/2025.acl-srw.95
%U https://aclanthology.org/2025.acl-srw.95/
%U https://doi.org/10.18653/v1/2025.acl-srw.95
%P 1239-1245
Markdown (Informal)
[Testing English News Articles for Lexical Homogenization Due to Widespread Use of Large Language Models](https://aclanthology.org/2025.acl-srw.95/) (Fitterer et al., ACL 2025)
ACL