@inproceedings{liu-etal-2026-metascale,
title = "{M}eta{S}cale: Test-Time Scaling with Evolving Meta-Thoughts",
author = "Liu, Qin and
Zhou, Wenxuan and
Xu, Nan and
Huang, James Y. and
Wang, Fei and
Zhang, Sheng and
Poon, Hoifung and
Chen, Muhao",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.574/",
pages = "11828--11842",
ISBN = "979-8-89176-395-1",
abstract = "One critical challenge for large language models (LLMs) in making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To address this limitation, we introduce MetaScale, a test-time scaling framework based on meta-thoughts, i.e., adaptive thinking strategies tailored to each task. MetaScale initializes a pool of candidate meta-thoughts, then iteratively selects and evaluates them using a multi-armed bandit algorithm with upper confidence bound selection, guided by a reward model. To further enhance adaptability, a genetic algorithm evolves high-reward meta-thoughts, refining and extending the strategy pool over time. By dynamically proposing and optimizing meta-thoughts at inference time, MetaScale improves both accuracy and generalization across a wide range of tasks. Experimental results demonstrate that MetaScale consistently outperforms standard inference approaches, achieving an 11{\%} performance gain in win rate on Arena-Hard with GPT-4o, improving from 82.14{\%} to 93.14{\%} against GPT-4. Notably, MetaScale scales more effectively with increasing sampling budgets and produces more structured, expert-level responses."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2026-metascale">
<titleInfo>
<title>MetaScale: Test-Time Scaling with Evolving Meta-Thoughts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Qin</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenxuan</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nan</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="given">Y</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sheng</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hoifung</namePart>
<namePart type="family">Poon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhao</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>One critical challenge for large language models (LLMs) in making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To address this limitation, we introduce MetaScale, a test-time scaling framework based on meta-thoughts, i.e., adaptive thinking strategies tailored to each task. MetaScale initializes a pool of candidate meta-thoughts, then iteratively selects and evaluates them using a multi-armed bandit algorithm with upper confidence bound selection, guided by a reward model. To further enhance adaptability, a genetic algorithm evolves high-reward meta-thoughts, refining and extending the strategy pool over time. By dynamically proposing and optimizing meta-thoughts at inference time, MetaScale improves both accuracy and generalization across a wide range of tasks. Experimental results demonstrate that MetaScale consistently outperforms standard inference approaches, achieving an 11% performance gain in win rate on Arena-Hard with GPT-4o, improving from 82.14% to 93.14% against GPT-4. Notably, MetaScale scales more effectively with increasing sampling budgets and produces more structured, expert-level responses.</abstract>
<identifier type="citekey">liu-etal-2026-metascale</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.574/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>11828</start>
<end>11842</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MetaScale: Test-Time Scaling with Evolving Meta-Thoughts
%A Liu, Qin
%A Zhou, Wenxuan
%A Xu, Nan
%A Huang, James Y.
%A Wang, Fei
%A Zhang, Sheng
%A Poon, Hoifung
%A Chen, Muhao
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F liu-etal-2026-metascale
%X One critical challenge for large language models (LLMs) in making complex reasoning is their reliance on matching reasoning patterns from training data, instead of proactively selecting the most appropriate cognitive strategy to solve a given task. Existing approaches impose fixed cognitive structures that enhance performance in specific tasks but lack adaptability across diverse scenarios. To address this limitation, we introduce MetaScale, a test-time scaling framework based on meta-thoughts, i.e., adaptive thinking strategies tailored to each task. MetaScale initializes a pool of candidate meta-thoughts, then iteratively selects and evaluates them using a multi-armed bandit algorithm with upper confidence bound selection, guided by a reward model. To further enhance adaptability, a genetic algorithm evolves high-reward meta-thoughts, refining and extending the strategy pool over time. By dynamically proposing and optimizing meta-thoughts at inference time, MetaScale improves both accuracy and generalization across a wide range of tasks. Experimental results demonstrate that MetaScale consistently outperforms standard inference approaches, achieving an 11% performance gain in win rate on Arena-Hard with GPT-4o, improving from 82.14% to 93.14% against GPT-4. Notably, MetaScale scales more effectively with increasing sampling budgets and produces more structured, expert-level responses.
%U https://aclanthology.org/2026.findings-acl.574/
%P 11828-11842
Markdown (Informal)
[MetaScale: Test-Time Scaling with Evolving Meta-Thoughts](https://aclanthology.org/2026.findings-acl.574/) (Liu et al., Findings 2026)
ACL
- Qin Liu, Wenxuan Zhou, Nan Xu, James Y. Huang, Fei Wang, Sheng Zhang, Hoifung Poon, and Muhao Chen. 2026. MetaScale: Test-Time Scaling with Evolving Meta-Thoughts. In Findings of the Association for Computational Linguistics: ACL 2026, pages 11828–11842, San Diego, California, United States. Association for Computational Linguistics.