@inproceedings{qi-etal-2025-tmath,
title = "{TMATH} A Dataset for Evaluating Large Language Models in Generating Educational Hints for Math Word Problems",
author = "Qi, Changyong and
Wei, Yuang and
Xu, Haoxin and
Zheng, Longwei and
Chen, Peiji and
Gu, Xiaoqing",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.340/",
pages = "5082--5093",
abstract = "Large Language Models (LLMs) are increasingly being applied in education, showing significant potential in personalized instruction, student feedback, and intelligent tutoring. Generating hints for Math Word Problems (MWPs) has become a critical application, particularly in helping students understand problem-solving steps and logic. However, existing models struggle to provide pedagogically sound guidance that fosters learning without offering direct answers. To address this issue, we introduce TMATH, a dataset specifically designed to evaluate LLMs' ability to generate high-quality hints for MWPs. TMATH contains diverse mathematical problems paired with carefully crafted, human-generated hints. To assess its impact, we fine-tuned a series of 7B-scale language models using TMATH. Our results, based on quantitative evaluations and expert assessments, show that while LLMs still face challenges in complex reasoning, the TMATH dataset significantly enhances their ability to generate more accurate and contextually appropriate educational hints."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="qi-etal-2025-tmath">
<titleInfo>
<title>TMATH: A Dataset for Evaluating Large Language Models in Generating Educational Hints for Math Word Problems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Changyong</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuang</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haoxin</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Longwei</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peiji</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoqing</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large Language Models (LLMs) are increasingly being applied in education, showing significant potential in personalized instruction, student feedback, and intelligent tutoring. Generating hints for Math Word Problems (MWPs) has become a critical application, particularly in helping students understand problem-solving steps and logic. However, existing models struggle to provide pedagogically sound guidance that fosters learning without offering direct answers. To address this issue, we introduce TMATH, a dataset specifically designed to evaluate LLMs’ ability to generate high-quality hints for MWPs. TMATH contains diverse mathematical problems paired with carefully crafted, human-generated hints. To assess its impact, we fine-tuned a series of 7B-scale language models using TMATH. Our results, based on quantitative evaluations and expert assessments, show that while LLMs still face challenges in complex reasoning, the TMATH dataset significantly enhances their ability to generate more accurate and contextually appropriate educational hints.</abstract>
<identifier type="citekey">qi-etal-2025-tmath</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.340/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>5082</start>
<end>5093</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TMATH: A Dataset for Evaluating Large Language Models in Generating Educational Hints for Math Word Problems
%A Qi, Changyong
%A Wei, Yuang
%A Xu, Haoxin
%A Zheng, Longwei
%A Chen, Peiji
%A Gu, Xiaoqing
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F qi-etal-2025-tmath
%X Large Language Models (LLMs) are increasingly being applied in education, showing significant potential in personalized instruction, student feedback, and intelligent tutoring. Generating hints for Math Word Problems (MWPs) has become a critical application, particularly in helping students understand problem-solving steps and logic. However, existing models struggle to provide pedagogically sound guidance that fosters learning without offering direct answers. To address this issue, we introduce TMATH, a dataset specifically designed to evaluate LLMs’ ability to generate high-quality hints for MWPs. TMATH contains diverse mathematical problems paired with carefully crafted, human-generated hints. To assess its impact, we fine-tuned a series of 7B-scale language models using TMATH. Our results, based on quantitative evaluations and expert assessments, show that while LLMs still face challenges in complex reasoning, the TMATH dataset significantly enhances their ability to generate more accurate and contextually appropriate educational hints.
%U https://aclanthology.org/2025.coling-main.340/
%P 5082-5093
Markdown (Informal)
[TMATH: A Dataset for Evaluating Large Language Models in Generating Educational Hints for Math Word Problems](https://aclanthology.org/2025.coling-main.340/) (Qi et al., COLING 2025)