% ACL Anthology record for the LongTableBench paper (Findings of EMNLP 2025).
% Case-sensitive words in the title are brace-protected as whole words, and the
% abstract is stored as plain text with no Markdown markup.
@inproceedings{li-etal-2025-longtablebench,
  title     = {{LongTableBench}: Benchmarking Long-Context Table Reasoning across Real-World Formats and Domains},
  author    = {Li, Liyao and
               Tian, Jiaming and
               Chen, Hao and
               Ye, Wentao and
               Ye, Chao and
               Wang, Haobo and
               Wang, Ningtao and
               Fu, Xing and
               Chen, Gang and
               Zhao, Junbo},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.638/},
  pages     = {11927--11965},
  isbn      = {979-8-89176-335-7},
  abstract  = {We introduce LongTableBench, a benchmark for evaluating long-context reasoning over semi-structured tables across diverse formats, tasks, and domains. It comprises 5,950 QA instances spanning 7 table formats (e.g., Markdown, HTML, SQL), 18 domains, and input lengths up to 128K tokens, including multi-turn and multi-table settings. To ensure data quality, we combine symbolic supervision, cross-model validation, and human review. Evaluating 52 LLMs{---}including general-purpose, table-specific, and reasoning-enhanced models{---}reveals that only the strongest models maintain robust performance under increasing context lengths and format diversity. We further show that end-to-end models outperform compression-based approaches, especially on tasks requiring semantic integration. LongTableBench provides a rigorous, scalable testbed for advancing long-context tabular understanding and highlights key limitations in current LLMs' structural and reasoning capabilities.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the conference paper with citekey li-etal-2025-longtablebench. -->
  <mods ID="li-etal-2025-longtablebench">
    <titleInfo>
      <title>LongTableBench: Benchmarking Long-Context Table Reasoning across Real-World Formats and Domains</title>
    </titleInfo>
    <!-- Authors: one personal name per element, each with a marcrelator "author" role. -->
    <name type="personal">
      <namePart type="given">Liyao</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiaming</namePart>
      <namePart type="family">Tian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hao</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wentao</namePart>
      <namePart type="family">Ye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chao</namePart>
      <namePart type="family">Ye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Haobo</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ningtao</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xing</namePart>
      <namePart type="family">Fu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gang</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junbo</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <!-- Abstract is plain text; no Markdown or other inline markup. -->
    <abstract>We introduce LongTableBench, a benchmark for evaluating long-context reasoning over semi-structured tables across diverse formats, tasks, and domains. It comprises 5,950 QA instances spanning 7 table formats (e.g., Markdown, HTML, SQL), 18 domains, and input lengths up to 128K tokens, including multi-turn and multi-table settings. To ensure data quality, we combine symbolic supervision, cross-model validation, and human review. Evaluating 52 LLMs—including general-purpose, table-specific, and reasoning-enhanced models—reveals that only the strongest models maintain robust performance under increasing context lengths and format diversity. We further show that end-to-end models outperform compression-based approaches, especially on tasks requiring semantic integration. LongTableBench provides a rigorous, scalable testbed for advancing long-context tabular understanding and highlights key limitations in current LLMs’ structural and reasoning capabilities.</abstract>
    <identifier type="citekey">li-etal-2025-longtablebench</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.638/</url>
    </location>
    <!-- Position within the host volume: issue date and page range. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>11927</start>
        <end>11965</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T LongTableBench: Benchmarking Long-Context Table Reasoning across Real-World Formats and Domains
%A Li, Liyao
%A Tian, Jiaming
%A Chen, Hao
%A Ye, Wentao
%A Ye, Chao
%A Wang, Haobo
%A Wang, Ningtao
%A Fu, Xing
%A Chen, Gang
%A Zhao, Junbo
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F li-etal-2025-longtablebench
%X We introduce LongTableBench, a benchmark for evaluating long-context reasoning over semi-structured tables across diverse formats, tasks, and domains. It comprises 5,950 QA instances spanning 7 table formats (e.g., Markdown, HTML, SQL), 18 domains, and input lengths up to 128K tokens, including multi-turn and multi-table settings. To ensure data quality, we combine symbolic supervision, cross-model validation, and human review. Evaluating 52 LLMs—including general-purpose, table-specific, and reasoning-enhanced models—reveals that only the strongest models maintain robust performance under increasing context lengths and format diversity. We further show that end-to-end models outperform compression-based approaches, especially on tasks requiring semantic integration. LongTableBench provides a rigorous, scalable testbed for advancing long-context tabular understanding and highlights key limitations in current LLMs’ structural and reasoning capabilities.
%U https://aclanthology.org/2025.findings-emnlp.638/
%P 11927-11965
Markdown (Informal)
[LongTableBench: Benchmarking Long-Context Table Reasoning across Real-World Formats and Domains](https://aclanthology.org/2025.findings-emnlp.638/) (Li et al., Findings 2025)
ACL
- Liyao Li, Jiaming Tian, Hao Chen, Wentao Ye, Chao Ye, Haobo Wang, Ningtao Wang, Xing Fu, Gang Chen, and Junbo Zhao. 2025. LongTableBench: Benchmarking Long-Context Table Reasoning across Real-World Formats and Domains. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 11927–11965, Suzhou, China. Association for Computational Linguistics.