@inproceedings{yao-yadav-2025-diverse,
title = "Diverse Multi-tool Aggregation with Large Language Models for Enhanced Math Reasoning",
author = "Yao, Bohan and
Yadav, Vikas",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1377/",
doi = "10.18653/v1/2025.findings-emnlp.1377",
pages = "25264--25282",
ISBN = "979-8-89176-335-7",
abstract = "Tool usage is a proven technique for developing high-performance reasoning in large language models (LLMs). Our work is focused on emphasizing the utility of leveraging multiple diverse tools for complex reasoning tasks. We present $\textbf{Multi-TAG}$, a $\textbf{Multi}$-$\textbf{T}$ool $\textbf{AG}$gregation-based LLM framework that utilizes multiple diverse tools to solve complex math problems over multiple reasoning steps. At each reasoning step, $\textbf{Multi-TAG}$ invokes multiple tools and accepts the solution of the respective step by tools that have majority agreement on the final answer estimate. $\textbf{Multi-TAG}$ strongly outperforms several standard baselines that use individual tools with the same number of runs, highlighting the importance of multi-tool invocation for solving complex reasoning tasks. We also show that naive aggregation of multiple tools at each reasoning step also leads to substantial improvements of up to 35{\%} accuracy. $\textbf{Multi-TAG}$ then further improves these gains by 7.4{\%} on average on MATH500, AIME, AMC, and OlympiadBench."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yao-yadav-2025-diverse">
<titleInfo>
<title>Diverse Multi-tool Aggregation with Large Language Models for Enhanced Math Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bohan</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vikas</namePart>
<namePart type="family">Yadav</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-335-7</identifier>
</relatedItem>
<abstract>Tool usage is a proven technique for developing high-performance reasoning in large language models (LLMs). Our work is focused on emphasizing the utility of leveraging multiple diverse tools for complex reasoning tasks. We present Multi-TAG, a Multi-Tool AGgregation-based LLM framework that utilizes multiple diverse tools to solve complex math problems over multiple reasoning steps. At each reasoning step, Multi-TAG invokes multiple tools and accepts the solution of the respective step by tools that have majority agreement on the final answer estimate. Multi-TAG strongly outperforms several standard baselines that use individual tools with the same number of runs, highlighting the importance of multi-tool invocation for solving complex reasoning tasks. We also show that naive aggregation of multiple tools at each reasoning step also leads to substantial improvements of up to 35% accuracy. Multi-TAG then further improves these gains by 7.4% on average on MATH500, AIME, AMC, and OlympiadBench.</abstract>
<identifier type="citekey">yao-yadav-2025-diverse</identifier>
<identifier type="doi">10.18653/v1/2025.findings-emnlp.1377</identifier>
<location>
<url>https://aclanthology.org/2025.findings-emnlp.1377/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>25264</start>
<end>25282</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Diverse Multi-tool Aggregation with Large Language Models for Enhanced Math Reasoning
%A Yao, Bohan
%A Yadav, Vikas
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F yao-yadav-2025-diverse
%X Tool usage is a proven technique for developing high-performance reasoning in large language models (LLMs). Our work is focused on emphasizing the utility of leveraging multiple diverse tools for complex reasoning tasks. We present Multi-TAG, a Multi-Tool AGgregation-based LLM framework that utilizes multiple diverse tools to solve complex math problems over multiple reasoning steps. At each reasoning step, Multi-TAG invokes multiple tools and accepts the solution of the respective step by tools that have majority agreement on the final answer estimate. Multi-TAG strongly outperforms several standard baselines that use individual tools with the same number of runs, highlighting the importance of multi-tool invocation for solving complex reasoning tasks. We also show that naive aggregation of multiple tools at each reasoning step also leads to substantial improvements of up to 35% accuracy. Multi-TAG then further improves these gains by 7.4% on average on MATH500, AIME, AMC, and OlympiadBench.
%R 10.18653/v1/2025.findings-emnlp.1377
%U https://aclanthology.org/2025.findings-emnlp.1377/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1377
%P 25264-25282
Markdown (Informal)
[Diverse Multi-tool Aggregation with Large Language Models for Enhanced Math Reasoning](https://aclanthology.org/2025.findings-emnlp.1377/) (Yao & Yadav, Findings 2025)
ACL