@article{gao-etal-2025-llm,
title = "{LLM}-based {NLG} Evaluation: Current Status and Challenges",
author = "Gao, Mingqi and
Hu, Xinyu and
Yin, Xunjian and
Ruan, Jie and
Pu, Xiao and
Wan, Xiaojun",
journal = "Computational Linguistics",
volume = "51",
month = jun,
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.cl-2.9/",
doi = "10.1162/coli_a_00561",
pages = "661--687",
abstract = "Evaluating natural language generation (NLG) is a vital but challenging problem in natural language processing. Traditional evaluation metrics mainly capturing content (e.g., n-gram) overlap between system outputs and references are far from satisfactory, and large language models (LLMs) such as ChatGPT have demonstrated great potential in NLG evaluation in recent years. Various automatic evaluation methods based on LLMs have been proposed, including metrics derived from LLMs, prompting LLMs, fine-tuning LLMs, and human{--}LLM collaborative evaluation. In this survey, we first give a taxonomy of LLM-based NLG evaluation methods, and discuss their pros and cons, respectively. Lastly, we discuss several open problems in this area and point out future research directions."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="gao-etal-2025-llm">
    <titleInfo>
      <title>LLM-based NLG Evaluation: Current Status and Challenges</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Mingqi</namePart>
      <namePart type="family">Gao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xinyu</namePart>
      <namePart type="family">Hu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xunjian</namePart>
      <namePart type="family">Yin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jie</namePart>
      <namePart type="family">Ruan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiao</namePart>
      <namePart type="family">Pu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaojun</namePart>
      <namePart type="family">Wan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <genre authority="bibutilsgt">journal article</genre>
    <relatedItem type="host">
      <titleInfo>
        <title>Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <issuance>continuing</issuance>
        <publisher>MIT Press</publisher>
        <place>
          <placeTerm type="text">Cambridge, MA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">periodical</genre>
      <genre authority="bibutilsgt">academic journal</genre>
    </relatedItem>
    <abstract>Evaluating natural language generation (NLG) is a vital but challenging problem in natural language processing. Traditional evaluation metrics mainly capturing content (e.g., n-gram) overlap between system outputs and references are far from satisfactory, and large language models (LLMs) such as ChatGPT have demonstrated great potential in NLG evaluation in recent years. Various automatic evaluation methods based on LLMs have been proposed, including metrics derived from LLMs, prompting LLMs, fine-tuning LLMs, and human–LLM collaborative evaluation. In this survey, we first give a taxonomy of LLM-based NLG evaluation methods, and discuss their pros and cons, respectively. Lastly, we discuss several open problems in this area and point out future research directions.</abstract>
    <identifier type="citekey">gao-etal-2025-llm</identifier>
    <identifier type="doi">10.1162/coli_a_00561</identifier>
    <location>
      <url>https://aclanthology.org/2025.cl-2.9/</url>
    </location>
    <part>
      <date>2025-06</date>
      <detail type="volume"><number>51</number></detail>
      <extent unit="page">
        <start>661</start>
        <end>687</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Journal Article
%T LLM-based NLG Evaluation: Current Status and Challenges
%A Gao, Mingqi
%A Hu, Xinyu
%A Yin, Xunjian
%A Ruan, Jie
%A Pu, Xiao
%A Wan, Xiaojun
%J Computational Linguistics
%D 2025
%8 June
%V 51
%I MIT Press
%C Cambridge, MA
%F gao-etal-2025-llm
%X Evaluating natural language generation (NLG) is a vital but challenging problem in natural language processing. Traditional evaluation metrics mainly capturing content (e.g., n-gram) overlap between system outputs and references are far from satisfactory, and large language models (LLMs) such as ChatGPT have demonstrated great potential in NLG evaluation in recent years. Various automatic evaluation methods based on LLMs have been proposed, including metrics derived from LLMs, prompting LLMs, fine-tuning LLMs, and human–LLM collaborative evaluation. In this survey, we first give a taxonomy of LLM-based NLG evaluation methods, and discuss their pros and cons, respectively. Lastly, we discuss several open problems in this area and point out future research directions.
%R 10.1162/coli_a_00561
%U https://aclanthology.org/2025.cl-2.9/
%U https://doi.org/10.1162/coli_a_00561
%P 661-687

Markdown (Informal)
[LLM-based NLG Evaluation: Current Status and Challenges](https://aclanthology.org/2025.cl-2.9/) (Gao et al., CL 2025)

ACL
Mingqi Gao, Xinyu Hu, Xunjian Yin, Jie Ruan, Xiao Pu, and Xiaojun Wan. 2025. LLM-based NLG Evaluation: Current Status and Challenges. Computational Linguistics, 51:661–687.