@inproceedings{huidrom-etal-2025-assessing,
    title = "Assessing Semantic Consistency in {D}ata{-}to{-}{T}ext Generation: A Meta-Evaluation of Textual, Semantic and Model-Based Metrics",
    author = "Huidrom, Rudali and
      Lorandi, Michela and
      Mille, Simon and
      Thomson, Craig and
      Belz, Anya",
    editor = "Flek, Lucie and
      Narayan, Shashi and
      Phương, Lê Hồng and
      Pei, Jiahuan",
    booktitle = "Proceedings of the 18th International Natural Language Generation Conference",
    month = oct,
    year = "2025",
    address = "Hanoi, Vietnam",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.inlg-main.6/",
    pages = "98--107",
    abstract = "Ensuring semantic consistency between semantic-triple inputs and generated text is crucial in data{-}to{-}text generation, but continues to pose challenges both during generation and in evaluation. In order to assess how accurately semantic consistency can currently be assessed, we meta-evaluate 29 different evaluation methods in terms of their ability to predict human semantic-consistency ratings. The evaluation methods include embeddings{-}based, overlap{-}based, and edit{-}distance metrics, as well as learned regressors and a prompted `LLM{-}as{-}judge' protocol. We meta-evaluate on two datasets: the WebNLG 2017 human evaluation dataset, and a newly created WebNLG-style dataset that none of the methods can have seen during training. We find that none of the traditional textual similarity metrics or the pre-Transformer model-based metrics are suitable for the task of semantic consistency assessment. LLM-based methods perform well on the whole, but best correlations with human judgments still lag behind those seen in other text generation tasks."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="huidrom-etal-2025-assessing">
    <titleInfo>
      <title>Assessing Semantic Consistency in Data-to-Text Generation: A Meta-Evaluation of Textual, Semantic and Model-Based Metrics</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Rudali</namePart>
      <namePart type="family">Huidrom</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michela</namePart>
      <namePart type="family">Lorandi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simon</namePart>
      <namePart type="family">Mille</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Craig</namePart>
      <namePart type="family">Thomson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anya</namePart>
      <namePart type="family">Belz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-10</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th International Natural Language Generation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lucie</namePart>
        <namePart type="family">Flek</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shashi</namePart>
        <namePart type="family">Narayan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lê</namePart>
        <namePart type="given">Hồng</namePart>
        <namePart type="family">Phương</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiahuan</namePart>
        <namePart type="family">Pei</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Hanoi, Vietnam</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Ensuring semantic consistency between semantic-triple inputs and generated text is crucial in data-to-text generation, but continues to pose challenges both during generation and in evaluation. In order to assess how accurately semantic consistency can currently be assessed, we meta-evaluate 29 different evaluation methods in terms of their ability to predict human semantic-consistency ratings. The evaluation methods include embeddings-based, overlap-based, and edit-distance metrics, as well as learned regressors and a prompted ‘LLM-as-judge’ protocol. We meta-evaluate on two datasets: the WebNLG 2017 human evaluation dataset, and a newly created WebNLG-style dataset that none of the methods can have seen during training. We find that none of the traditional textual similarity metrics or the pre-Transformer model-based metrics are suitable for the task of semantic consistency assessment. LLM-based methods perform well on the whole, but best correlations with human judgments still lag behind those seen in other text generation tasks.</abstract>
    <identifier type="citekey">huidrom-etal-2025-assessing</identifier>
    <location>
      <url>https://aclanthology.org/2025.inlg-main.6/</url>
    </location>
    <part>
      <date>2025-10</date>
      <extent unit="page">
        <start>98</start>
        <end>107</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Assessing Semantic Consistency in Data-to-Text Generation: A Meta-Evaluation of Textual, Semantic and Model-Based Metrics
%A Huidrom, Rudali
%A Lorandi, Michela
%A Mille, Simon
%A Thomson, Craig
%A Belz, Anya
%Y Flek, Lucie
%Y Narayan, Shashi
%Y Phương, Lê Hồng
%Y Pei, Jiahuan
%S Proceedings of the 18th International Natural Language Generation Conference
%D 2025
%8 October
%I Association for Computational Linguistics
%C Hanoi, Vietnam
%F huidrom-etal-2025-assessing
%X Ensuring semantic consistency between semantic-triple inputs and generated text is crucial in data-to-text generation, but continues to pose challenges both during generation and in evaluation. In order to assess how accurately semantic consistency can currently be assessed, we meta-evaluate 29 different evaluation methods in terms of their ability to predict human semantic-consistency ratings. The evaluation methods include embeddings-based, overlap-based, and edit-distance metrics, as well as learned regressors and a prompted ‘LLM-as-judge’ protocol. We meta-evaluate on two datasets: the WebNLG 2017 human evaluation dataset, and a newly created WebNLG-style dataset that none of the methods can have seen during training. We find that none of the traditional textual similarity metrics or the pre-Transformer model-based metrics are suitable for the task of semantic consistency assessment. LLM-based methods perform well on the whole, but best correlations with human judgments still lag behind those seen in other text generation tasks.
%U https://aclanthology.org/2025.inlg-main.6/
%P 98-107

Markdown (Informal)
[Assessing Semantic Consistency in Data-to-Text Generation: A Meta-Evaluation of Textual, Semantic and Model-Based Metrics](https://aclanthology.org/2025.inlg-main.6/) (Huidrom et al., INLG 2025)

ACL
Rudali Huidrom, Michela Lorandi, Simon Mille, Craig Thomson, and Anya Belz. 2025. Assessing Semantic Consistency in Data-to-Text Generation: A Meta-Evaluation of Textual, Semantic and Model-Based Metrics. In Proceedings of the 18th International Natural Language Generation Conference, pages 98–107, Hanoi, Vietnam. Association for Computational Linguistics.