@inproceedings{lin-etal-2025-weak2wise,
title = "{W}eak2{W}ise: An Automated, Lightweight Framework for Weak-{LLM}-Friendly Reasoning Synthesis",
author = "Lin, Jianing and
Guo, Yuanfang and
Liu, Shunning and
Liu, Zeming and
Wang, Yunhong",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.1070/",
pages = "19639--19657",
ISBN = "979-8-89176-335-7",
abstract = "Recent advances in large language model (LLM) fine{-}tuning have shown that training data augmented with high-quality reasoning traces can remarkably improve downstream performance. However, existing approaches usually rely on expensive manual annotations or auxiliary models, and fail to address the unique constraints of smaller ``weak'' LLMs. To bridge these gaps, we introduce Weak2Wise, a fully automated, lightweight framework for synthesizing high{-}quality, weak-LLM-friendly reasoning traces. Starting from a QA dataset, Weak2Wise filters out the samples that can already be correctly answered by the weak LLM, gathers diverse candidate reasoning traces from multiple strong LLMs, and leverages our Step{-}Mask scoring to rank and truncate the most guidance{-}effective traces. These reasoning traces are then used for fine{-}tuning, yielding substantial improvements in the weak LLM{'}s reasoning abilities. The name Weak2Wise has two meanings: using a ``weak'' LLM to select the ``wisest'' reasoning traces generated by stronger LLMs, and fine{-}tuning the same weak LLM on these reasoning traces to become ``wiser''. We further use Weak2Wise to build GR-1K, a 1,000{-}sample math and science QA{-}reasoning dataset optimized for weak LLMs, and fine{-}tune Qwen2.5{-}7B on it to create GR{-}7B, which achieves superior performance on AIME2024, MATH{-}500, and GPQA Diamond benchmarks. Our codes are publicly released to facilitate further research."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="lin-etal-2025-weak2wise">
    <titleInfo>
      <title>Weak2Wise: An Automated, Lightweight Framework for Weak-LLM-Friendly Reasoning Synthesis</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jianing</namePart>
      <namePart type="family">Lin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuanfang</namePart>
      <namePart type="family">Guo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shunning</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zeming</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yunhong</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <abstract>Recent advances in large language model (LLM) fine-tuning have shown that training data augmented with high-quality reasoning traces can remarkably improve downstream performance. However, existing approaches usually rely on expensive manual annotations or auxiliary models, and fail to address the unique constraints of smaller “weak” LLMs. To bridge these gaps, we introduce Weak2Wise, a fully automated, lightweight framework for synthesizing high-quality, weak-LLM-friendly reasoning traces. Starting from a QA dataset, Weak2Wise filters out the samples that can already be correctly answered by the weak LLM, gathers diverse candidate reasoning traces from multiple strong LLMs, and leverages our Step-Mask scoring to rank and truncate the most guidance-effective traces. These reasoning traces are then used for fine-tuning, yielding substantial improvements in the weak LLM’s reasoning abilities. The name Weak2Wise has two meanings: using a “weak” LLM to select the “wisest” reasoning traces generated by stronger LLMs, and fine-tuning the same weak LLM on these reasoning traces to become “wiser”. We further use Weak2Wise to build GR-1K, a 1,000-sample math and science QA-reasoning dataset optimized for weak LLMs, and fine-tune Qwen2.5-7B on it to create GR-7B, which achieves superior performance on AIME2024, MATH-500, and GPQA Diamond benchmarks. Our codes are publicly released to facilitate further research.</abstract>
    <identifier type="citekey">lin-etal-2025-weak2wise</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.1070/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>19639</start>
        <end>19657</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Weak2Wise: An Automated, Lightweight Framework for Weak-LLM-Friendly Reasoning Synthesis
%A Lin, Jianing
%A Guo, Yuanfang
%A Liu, Shunning
%A Liu, Zeming
%A Wang, Yunhong
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F lin-etal-2025-weak2wise
%X Recent advances in large language model (LLM) fine-tuning have shown that training data augmented with high-quality reasoning traces can remarkably improve downstream performance. However, existing approaches usually rely on expensive manual annotations or auxiliary models, and fail to address the unique constraints of smaller “weak” LLMs. To bridge these gaps, we introduce Weak2Wise, a fully automated, lightweight framework for synthesizing high-quality, weak-LLM-friendly reasoning traces. Starting from a QA dataset, Weak2Wise filters out the samples that can already be correctly answered by the weak LLM, gathers diverse candidate reasoning traces from multiple strong LLMs, and leverages our Step-Mask scoring to rank and truncate the most guidance-effective traces. These reasoning traces are then used for fine-tuning, yielding substantial improvements in the weak LLM’s reasoning abilities. The name Weak2Wise has two meanings: using a “weak” LLM to select the “wisest” reasoning traces generated by stronger LLMs, and fine-tuning the same weak LLM on these reasoning traces to become “wiser”. We further use Weak2Wise to build GR-1K, a 1,000-sample math and science QA-reasoning dataset optimized for weak LLMs, and fine-tune Qwen2.5-7B on it to create GR-7B, which achieves superior performance on AIME2024, MATH-500, and GPQA Diamond benchmarks. Our codes are publicly released to facilitate further research.
%U https://aclanthology.org/2025.findings-emnlp.1070/
%P 19639-19657
Markdown (Informal)
[Weak2Wise: An Automated, Lightweight Framework for Weak-LLM-Friendly Reasoning Synthesis](https://aclanthology.org/2025.findings-emnlp.1070/) (Lin et al., Findings 2025)

ACL
Jianing Lin, Yuanfang Guo, Shunning Liu, Zeming Liu, and Yunhong Wang. 2025. Weak2Wise: An Automated, Lightweight Framework for Weak-LLM-Friendly Reasoning Synthesis. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 19639–19657, Suzhou, China. Association for Computational Linguistics.