@inproceedings{arai-etal-2025-evaluating,
  title     = {Evaluating {LLMs}' Ability to Understand Numerical Time Series for Text Generation},
  author    = {Arai, Mizuki and
               Ishigaki, Tatsuya and
               Kawarada, Masayuki and
               Miyao, Yusuke and
               Takamura, Hiroya and
               Kobayashi, Ichiro},
  editor    = {Flek, Lucie and
               Narayan, Shashi and
               Phương, Lê Hồng and
               Pei, Jiahuan},
  booktitle = {Proceedings of the 18th International Natural Language Generation Conference},
  month     = oct,
  year      = {2025},
  address   = {Hanoi, Vietnam},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.inlg-main.16/},
  pages     = {232--248},
  abstract  = {Data-to-text generation tasks often involve processing numerical time-series as input such as financial statistics or meteorological data. Although large language models (LLMs) are a powerful approach to data-to-text, we still lack a comprehensive understanding of how well they actually understand time-series data. We therefore introduce a benchmark with 18 evaluation tasks to assess LLMs' abilities of interpreting numerical time-series, which are categorized into: 1) event detection{---}identifying maxima and minima; 2) computation{---}averaging and summation; 3) pairwise comparison{---}comparing values over time; and 4) inference{---}imputation and forecasting. Our experiments reveal five key findings: 1) even state-of-the-art LLMs struggle with complex multi-step reasoning; 2) tasks that require extracting values or performing computations within a specified range of the time-series significantly reduce accuracy; 3) instruction tuning offers inconsistent improvements for numerical interpretation; 4) reasoning-based models outperform standard LLMs in complex numerical tasks; and 5) LLMs perform interpolation better than forecasting. These results establish a clear baseline and serve as a wake-up call for anyone aiming to blend fluent language with trustworthy numeric precision in time-series scenarios.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arai-etal-2025-evaluating">
<titleInfo>
<title>Evaluating LLMs’ Ability to Understand Numerical Time Series for Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mizuki</namePart>
<namePart type="family">Arai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatsuya</namePart>
<namePart type="family">Ishigaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masayuki</namePart>
<namePart type="family">Kawarada</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yusuke</namePart>
<namePart type="family">Miyao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroya</namePart>
<namePart type="family">Takamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ichiro</namePart>
<namePart type="family">Kobayashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th International Natural Language Generation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucie</namePart>
<namePart type="family">Flek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shashi</namePart>
<namePart type="family">Narayan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lê</namePart>
<namePart type="given">Hồng</namePart>
<namePart type="family">Phương</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahuan</namePart>
<namePart type="family">Pei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Hanoi, Vietnam</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Data-to-text generation tasks often involve processing numerical time-series as input such as financial statistics or meteorological data. Although large language models (LLMs) are a powerful approach to data-to-text, we still lack a comprehensive understanding of how well they actually understand time-series data. We therefore introduce a benchmark with 18 evaluation tasks to assess LLMs’ abilities of interpreting numerical time-series, which are categorized into: 1) event detection—identifying maxima and minima; 2) computation—averaging and summation; 3) pairwise comparison—comparing values over time; and 4) inference—imputation and forecasting. Our experiments reveal five key findings: 1) even state-of-the-art LLMs struggle with complex multi-step reasoning; 2) tasks that require extracting values or performing computations within a specified range of the time-series significantly reduce accuracy; 3) instruction tuning offers inconsistent improvements for numerical interpretation; 4) reasoning-based models outperform standard LLMs in complex numerical tasks; and 5) LLMs perform interpolation better than forecasting. These results establish a clear baseline and serve as a wake-up call for anyone aiming to blend fluent language with trustworthy numeric precision in time-series scenarios.</abstract>
<identifier type="citekey">arai-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.inlg-main.16/</url>
</location>
<part>
<date>2025-10</date>
<extent unit="page">
<start>232</start>
<end>248</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating LLMs’ Ability to Understand Numerical Time Series for Text Generation
%A Arai, Mizuki
%A Ishigaki, Tatsuya
%A Kawarada, Masayuki
%A Miyao, Yusuke
%A Takamura, Hiroya
%A Kobayashi, Ichiro
%Y Flek, Lucie
%Y Narayan, Shashi
%Y Phương, Lê Hồng
%Y Pei, Jiahuan
%S Proceedings of the 18th International Natural Language Generation Conference
%D 2025
%8 October
%I Association for Computational Linguistics
%C Hanoi, Vietnam
%F arai-etal-2025-evaluating
%X Data-to-text generation tasks often involve processing numerical time-series as input such as financial statistics or meteorological data. Although large language models (LLMs) are a powerful approach to data-to-text, we still lack a comprehensive understanding of how well they actually understand time-series data. We therefore introduce a benchmark with 18 evaluation tasks to assess LLMs’ abilities of interpreting numerical time-series, which are categorized into: 1) event detection—identifying maxima and minima; 2) computation—averaging and summation; 3) pairwise comparison—comparing values over time; and 4) inference—imputation and forecasting. Our experiments reveal five key findings: 1) even state-of-the-art LLMs struggle with complex multi-step reasoning; 2) tasks that require extracting values or performing computations within a specified range of the time-series significantly reduce accuracy; 3) instruction tuning offers inconsistent improvements for numerical interpretation; 4) reasoning-based models outperform standard LLMs in complex numerical tasks; and 5) LLMs perform interpolation better than forecasting. These results establish a clear baseline and serve as a wake-up call for anyone aiming to blend fluent language with trustworthy numeric precision in time-series scenarios.
%U https://aclanthology.org/2025.inlg-main.16/
%P 232-248
Markdown (Informal)
[Evaluating LLMs’ Ability to Understand Numerical Time Series for Text Generation](https://aclanthology.org/2025.inlg-main.16/) (Arai et al., INLG 2025)
ACL