@inproceedings{calais-etal-2025-disentangling,
title = "Disentangling Text and Math in Word Problems: Evidence for the Bidimensional Structure of Large Language Models' Reasoning",
author = "Calais, Pedro and
Franco, Gabriel and
Tang, Zilu and
Nikas, Themistoklis and
Jr., Wagner Meira and
Terzi, Evimaria and
Crovella, Mark",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.656/",
doi = "10.18653/v1/2025.findings-acl.656",
pages = "12671--12688",
ISBN = "979-8-89176-256-5",
abstract = "Do LLMs process text and mathematics as a unified skill, or do these components rely on distinct underlying mechanisms? We investigate this question by disentangling the textual interpretation and mathematical solving steps in word problems drawn from Brazil{'}s largest college entrance exam (ENEM) and GSM8K, a popular grade school-level benchmark. Using the symbolic solver SymPy, we transform word problems into equivalent purely mathematical representations, isolating equation formulation from textual comprehension. Our extended benchmarks enable a structured analysis of LLM performance across these two dimensions. Through empirical evaluations, we find that small-scale LLMs struggle significantly more with text interpretation than with equation solving, with accuracy dropping by a factor of 2 to 7 when solving full word problems compared to their math-only counterparts. Exploratory factor analysis confirms a bidimensional structure in LLM reasoning, where models exhibit distinct proficiencies in textual and mathematical components, underscoring the need for targeted improvements in language comprehension. By analyzing the latent factors associated with each model, our findings provide a framework for researchers and practitioners to make informed choices when selecting models based on computational costs and the nature of their tasks."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="calais-etal-2025-disentangling">
<titleInfo>
<title>Disentangling Text and Math in Word Problems: Evidence for the Bidimensional Structure of Large Language Models’ Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pedro</namePart>
<namePart type="family">Calais</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Franco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zilu</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Themistoklis</namePart>
<namePart type="family">Nikas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wagner</namePart>
<namePart type="given">Meira</namePart>
<namePart type="family">Jr.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Evimaria</namePart>
<namePart type="family">Terzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Crovella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Do LLMs process text and mathematics as a unified skill, or do these components rely on distinct underlying mechanisms? We investigate this question by disentangling the textual interpretation and mathematical solving steps in word problems drawn from Brazil’s largest college entrance exam (ENEM) and GSM8K, a popular grade school-level benchmark. Using the symbolic solver SymPy, we transform word problems into equivalent purely mathematical representations, isolating equation formulation from textual comprehension. Our extended benchmarks enable a structured analysis of LLM performance across these two dimensions. Through empirical evaluations, we find that small-scale LLMs struggle significantly more with text interpretation than with equation solving, with accuracy dropping by a factor of 2 to 7 when solving full word problems compared to their math-only counterparts. Exploratory factor analysis confirms a bidimensional structure in LLM reasoning, where models exhibit distinct proficiencies in textual and mathematical components, underscoring the need for targeted improvements in language comprehension. By analyzing the latent factors associated with each model, our findings provide a framework for researchers and practitioners to make informed choices when selecting models based on computational costs and the nature of their tasks.</abstract>
<identifier type="citekey">calais-etal-2025-disentangling</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.656</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.656/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>12671</start>
<end>12688</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Disentangling Text and Math in Word Problems: Evidence for the Bidimensional Structure of Large Language Models’ Reasoning
%A Calais, Pedro
%A Franco, Gabriel
%A Tang, Zilu
%A Nikas, Themistoklis
%A Jr., Wagner Meira
%A Terzi, Evimaria
%A Crovella, Mark
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F calais-etal-2025-disentangling
%X Do LLMs process text and mathematics as a unified skill, or do these components rely on distinct underlying mechanisms? We investigate this question by disentangling the textual interpretation and mathematical solving steps in word problems drawn from Brazil’s largest college entrance exam (ENEM) and GSM8K, a popular grade school-level benchmark. Using the symbolic solver SymPy, we transform word problems into equivalent purely mathematical representations, isolating equation formulation from textual comprehension. Our extended benchmarks enable a structured analysis of LLM performance across these two dimensions. Through empirical evaluations, we find that small-scale LLMs struggle significantly more with text interpretation than with equation solving, with accuracy dropping by a factor of 2 to 7 when solving full word problems compared to their math-only counterparts. Exploratory factor analysis confirms a bidimensional structure in LLM reasoning, where models exhibit distinct proficiencies in textual and mathematical components, underscoring the need for targeted improvements in language comprehension. By analyzing the latent factors associated with each model, our findings provide a framework for researchers and practitioners to make informed choices when selecting models based on computational costs and the nature of their tasks.
%R 10.18653/v1/2025.findings-acl.656
%U https://aclanthology.org/2025.findings-acl.656/
%U https://doi.org/10.18653/v1/2025.findings-acl.656
%P 12671-12688
Markdown (Informal)
[Disentangling Text and Math in Word Problems: Evidence for the Bidimensional Structure of Large Language Models’ Reasoning](https://aclanthology.org/2025.findings-acl.656/) (Calais et al., Findings 2025)
ACL
Pedro Calais, Gabriel Franco, Zilu Tang, Themistoklis Nikas, Wagner Meira Jr., Evimaria Terzi, and Mark Crovella. 2025. Disentangling Text and Math in Word Problems: Evidence for the Bidimensional Structure of Large Language Models’ Reasoning. In Findings of the Association for Computational Linguistics: ACL 2025, pages 12671–12688, Vienna, Austria. Association for Computational Linguistics.
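
The abstract describes converting word problems into "math-only counterparts" and solving them symbolically with SymPy. As an illustration only (this is a minimal sketch, not the authors' released code, and the equation is a hypothetical example rather than an item from ENEM or GSM8K), such a math-only instance might be solved as follows:

# Minimal sketch (assumption: not the authors' pipeline) of solving a
# "math-only counterpart" of a word problem with SymPy, as the abstract describes.
from sympy import symbols, Eq, solve

x = symbols('x')

# Hypothetical math-only representation of a word problem such as
# "Twice a number plus 3 equals 11. What is the number?"
equation = Eq(2 * x + 3, 11)

solution = solve(equation, x)
print(solution)  # [4]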