@comment{
  Workshop paper in the MathNLP 2025 proceedings, pages 186 to 194.
  NOTE(review): address holds Suzhou, China, which looks like the event
  location rather than the publisher's city; confirm before normalising.
  The abstract keeps its LaTeX markup (textit, rightarrow, spadesuit) as data.
}
@inproceedings{dias-alexiou-etal-2025-depth,
  title     = {An in-depth human study of the mathematical reasoning abilities in Large Language Models},
  author    = {Dias-Alexiou, Carolina and
               Marrese-Taylor, Edison and
               Matsuo, Yutaka},
  editor    = {Valentino, Marco and
               Ferreira, Deborah and
               Thayaparan, Mokanarangan and
               Ranaldi, Leonardo and
               Freitas, Andre},
  booktitle = {Proceedings of The 3rd Workshop on Mathematical Natural Language Processing (MathNLP 2025)},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.mathnlp-main.14/},
  pages     = {186--194},
  isbn      = {979-8-89176-348-7},
  abstract  = {We study the generalization capabilities of large language models (LLM) through the lens of mathematical reasoning, asking if these models can recognize that two structures are the same even when they do not share the same nomenclature. We propose a human study to evaluate if LLMs reproduce proofs that they have most likely seen during training, but when the symbols do not match the ones seen. To test this in a controlled scenario, we look at proofs in \textit{propositional calculus}, foundational for other logic systems, semantically complete and widely discussed online. We replace the implication operator ($\rightarrow$) with an unrelated, arbitrary symbol ($\spadesuit$) and ask experts to evaluate how the output of a selection of LLMs changes in terms of compliance, correctness, extensiveness and coherence. Our results show that nearly all our tested models produce lower quality proofs in this test, in particular open-weights models, suggesting the abilities of these LLMs to reason in this context have important limitations.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="dias-alexiou-etal-2025-depth">
    <titleInfo>
      <title>An in-depth human study of the mathematical reasoning abilities in Large Language Models</title>
    </titleInfo>
    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Carolina</namePart>
      <namePart type="family">Dias-Alexiou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Edison</namePart>
      <namePart type="family">Marrese-Taylor</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yutaka</namePart>
      <namePart type="family">Matsuo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of The 3rd Workshop on Mathematical Natural Language Processing (MathNLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marco</namePart>
        <namePart type="family">Valentino</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Deborah</namePart>
        <namePart type="family">Ferreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mokanarangan</namePart>
        <namePart type="family">Thayaparan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leonardo</namePart>
        <namePart type="family">Ranaldi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andre</namePart>
        <namePart type="family">Freitas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-348-7</identifier>
    </relatedItem>
    <!-- Plain-text abstract: the two operator symbols are given as Unicode characters rather than LaTeX macro names. -->
    <abstract>We study the generalization capabilities of large language models (LLM) through the lens of mathematical reasoning, asking if these models can recognize that two structures are the same even when they do not share the same nomenclature. We propose a human study to evaluate if LLMs reproduce proofs that they have most likely seen during training, but when the symbols do not match the ones seen. To test this in a controlled scenario, we look at proofs in propositional calculus, foundational for other logic systems, semantically complete and widely discussed online. We replace the implication operator (→) with an unrelated, arbitrary symbol (♠) and ask experts to evaluate how the output of a selection of LLMs changes in terms of compliance, correctness, extensiveness and coherence. Our results show that nearly all our tested models produce lower quality proofs in this test, in particular open-weights models, suggesting the abilities of these LLMs to reason in this context have important limitations.</abstract>
    <identifier type="citekey">dias-alexiou-etal-2025-depth</identifier>
    <location>
      <url>https://aclanthology.org/2025.mathnlp-main.14/</url>
    </location>
    <!-- Position within the host volume: issue date and page range. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>186</start>
        <end>194</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T An in-depth human study of the mathematical reasoning abilities in Large Language Models
%A Dias-Alexiou, Carolina
%A Marrese-Taylor, Edison
%A Matsuo, Yutaka
%Y Valentino, Marco
%Y Ferreira, Deborah
%Y Thayaparan, Mokanarangan
%Y Ranaldi, Leonardo
%Y Freitas, Andre
%S Proceedings of The 3rd Workshop on Mathematical Natural Language Processing (MathNLP 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-348-7
%F dias-alexiou-etal-2025-depth
%X We study the generalization capabilities of large language models (LLM) through the lens of mathematical reasoning, asking if these models can recognize that two structures are the same even when they do not share the same nomenclature. We propose a human study to evaluate if LLMs reproduce proofs that they have most likely seen during training, but when the symbols do not match the ones seen. To test this in a controlled scenario, we look at proofs in propositional calculus, foundational for other logic systems, semantically complete and widely discussed online. We replace the implication operator (→) with an unrelated, arbitrary symbol (♠) and ask experts to evaluate how the output of a selection of LLMs changes in terms of compliance, correctness, extensiveness and coherence. Our results show that nearly all our tested models produce lower quality proofs in this test, in particular open-weights models, suggesting the abilities of these LLMs to reason in this context have important limitations.
%U https://aclanthology.org/2025.mathnlp-main.14/
%P 186-194
Markdown (Informal)
[An in-depth human study of the mathematical reasoning abilities in Large Language Models](https://aclanthology.org/2025.mathnlp-main.14/) (Dias-Alexiou et al., MathNLP 2025)
ACL