@inproceedings{trigg-etal-2026-logic,
title = "Logic-Level Evaluation of Logical Table-to-Text Generation",
author = "Trigg, Lena and
Hougen, Dean F. and
Bilal, Ahsan",
editor = "Bonial, Claire and
Berzak, Yevgeni",
booktitle = "Proceedings of the 30th Conference on Computational Natural Language Learning",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.conll-main.41/",
pages = "677--691",
ISBN = "979-8-89176-410-1",
abstract = "Logical Table-to-Text (LT2T) generation aims to produce natural-language sentences that are logically faithful to structured tabular data. While recent Large Language Models (LLMs) show high performance on aggregate fidelity metrics, these scores provide only a coarse view of performance, obscuring specific logic-type reasoning failures and models' meta-logical awareness. We propose an operation-aware diagnostic framework that evaluates four core competencies: (1) Logical Form (LF) execution accuracy, (2) fidelity of LF-conditioned generation, (3) logic-type identification, and (4) LF-free generation. We apply this framework to a suite of frontier LLMs and perform fine-grained analysis across logic types such as aggregation, ordinal, and superlative reasoning. Our results show that LT2T fidelity assessment can be unstable; the choice of verifier and logic type can substantially alter conclusions and model rankings. Crucially, we identify a meta-logical gap: models often generate faithful statements while failing to identify the underlying operation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="trigg-etal-2026-logic">
<titleInfo>
<title>Logic-Level Evaluation of Logical Table-to-Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lena</namePart>
<namePart type="family">Trigg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dean</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Hougen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahsan</namePart>
<namePart type="family">Bilal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 30th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Claire</namePart>
<namePart type="family">Bonial</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yevgeni</namePart>
<namePart type="family">Berzak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-410-1</identifier>
</relatedItem>
<abstract>Logical Table-to-Text (LT2T) generation aims to produce natural-language sentences that are logically faithful to structured tabular data. While recent Large Language Models (LLMs) show high performance on aggregate fidelity metrics, these scores provide only a coarse view of performance, obscuring specific logic-type reasoning failures and models’ meta-logical awareness. We propose an operation-aware diagnostic framework that evaluates four core competencies: (1) Logical Form (LF) execution accuracy, (2) fidelity of LF-conditioned generation, (3) logic-type identification, and (4) LF-free generation. We apply this framework to a suite of frontier LLMs and perform fine-grained analysis across logic types such as aggregation, ordinal, and superlative reasoning. Our results show that LT2T fidelity assessment can be unstable; the choice of verifier and logic type can substantially alter conclusions and model rankings. Crucially, we identify a meta-logical gap: models often generate faithful statements while failing to identify the underlying operation.</abstract>
<identifier type="citekey">trigg-etal-2026-logic</identifier>
<location>
<url>https://aclanthology.org/2026.conll-main.41/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>677</start>
<end>691</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Logic-Level Evaluation of Logical Table-to-Text Generation
%A Trigg, Lena
%A Hougen, Dean F.
%A Bilal, Ahsan
%Y Bonial, Claire
%Y Berzak, Yevgeni
%S Proceedings of the 30th Conference on Computational Natural Language Learning
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-410-1
%F trigg-etal-2026-logic
%X Logical Table-to-Text (LT2T) generation aims to produce natural-language sentences that are logically faithful to structured tabular data. While recent Large Language Models (LLMs) show high performance on aggregate fidelity metrics, these scores provide only a coarse view of performance, obscuring specific logic-type reasoning failures and models’ meta-logical awareness. We propose an operation-aware diagnostic framework that evaluates four core competencies: (1) Logical Form (LF) execution accuracy, (2) fidelity of LF-conditioned generation, (3) logic-type identification, and (4) LF-free generation. We apply this framework to a suite of frontier LLMs and perform fine-grained analysis across logic types such as aggregation, ordinal, and superlative reasoning. Our results show that LT2T fidelity assessment can be unstable; the choice of verifier and logic type can substantially alter conclusions and model rankings. Crucially, we identify a meta-logical gap: models often generate faithful statements while failing to identify the underlying operation.
%U https://aclanthology.org/2026.conll-main.41/
%P 677-691
Markdown (Informal)
[Logic-Level Evaluation of Logical Table-to-Text Generation](https://aclanthology.org/2026.conll-main.41/) (Trigg et al., CoNLL 2026)
ACL