@inproceedings{kelious-etal-2025-large,
title = "How Do Large Language Models Evaluate Lexical Complexity?",
author = "Kelious, Abdelhak and
Constant, Mathieu and
Coeur, Christophe",
editor = "Frermann, Lea and
Stevenson, Mark",
booktitle = "Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.starsem-1.28/",
pages = "348--361",
ISBN = "979-8-89176-340-1",
abstract = "In this work, we explore the prediction of lexical complexity by combining supervised approaches and the use of large language models (LLMs). We first evaluate the impact of different prompting strategies (zero-shot, one-shot, and chain-of-thought) on the quality of the predictions, comparing the results with human annotations from the CompLex 2.0 corpus. Our results indicate that LLMs, and in particular gpt-4o, benefit from explicit instructions to better approximate human judgments, although some discrepancies remain. Moreover, a calibration approach to better align LLMs predictions and human judgements based on few manually annotated data appears as a promising solution to improve the reliability of the annotations in a supervised scenario."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kelious-etal-2025-large">
<titleInfo>
<title>How Do Large Language Models Evaluate Lexical Complexity?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abdelhak</namePart>
<namePart type="family">Kelious</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mathieu</namePart>
<namePart type="family">Constant</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christophe</namePart>
<namePart type="family">Coeur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lea</namePart>
<namePart type="family">Frermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Stevenson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-340-1</identifier>
</relatedItem>
<abstract>In this work, we explore the prediction of lexical complexity by combining supervised approaches and the use of large language models (LLMs). We first evaluate the impact of different prompting strategies (zero-shot, one-shot, and chain-of-thought) on the quality of the predictions, comparing the results with human annotations from the CompLex 2.0 corpus. Our results indicate that LLMs, and in particular gpt-4o, benefit from explicit instructions to better approximate human judgments, although some discrepancies remain. Moreover, a calibration approach to better align LLMs predictions and human judgements based on few manually annotated data appears as a promising solution to improve the reliability of the annotations in a supervised scenario.</abstract>
<identifier type="citekey">kelious-etal-2025-large</identifier>
<location>
<url>https://aclanthology.org/2025.starsem-1.28/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>348</start>
<end>361</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How Do Large Language Models Evaluate Lexical Complexity?
%A Kelious, Abdelhak
%A Constant, Mathieu
%A Coeur, Christophe
%Y Frermann, Lea
%Y Stevenson, Mark
%S Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-340-1
%F kelious-etal-2025-large
%X In this work, we explore the prediction of lexical complexity by combining supervised approaches and the use of large language models (LLMs). We first evaluate the impact of different prompting strategies (zero-shot, one-shot, and chain-of-thought) on the quality of the predictions, comparing the results with human annotations from the CompLex 2.0 corpus. Our results indicate that LLMs, and in particular gpt-4o, benefit from explicit instructions to better approximate human judgments, although some discrepancies remain. Moreover, a calibration approach to better align LLMs predictions and human judgements based on few manually annotated data appears as a promising solution to improve the reliability of the annotations in a supervised scenario.
%U https://aclanthology.org/2025.starsem-1.28/
%P 348-361
Markdown (Informal)
[How Do Large Language Models Evaluate Lexical Complexity?](https://aclanthology.org/2025.starsem-1.28/) (Kelious et al., *SEM 2025)
ACL
- Abdelhak Kelious, Mathieu Constant, and Christophe Coeur. 2025. How Do Large Language Models Evaluate Lexical Complexity? In Proceedings of the 14th Joint Conference on Lexical and Computational Semantics (*SEM 2025), pages 348–361, Suzhou, China. Association for Computational Linguistics.