@inproceedings{liu-etal-2026-discover,
title = "Discover and Prove: An Open-source Agentic Framework for Hard Mode Automated Theorem Proving in Lean 4",
author = "Liu, Chengwu and
Yin, Yichun and
Yuan, Ye and
Xie, Jiaxuan and
Li, Botao and
Li, Siqi and
Shen, Jianhao and
Xu, Yan and
Shang, Lifeng and
Zhang, Ming",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.3/",
pages = "117--133",
ISBN = "979-8-89176-390-6",
abstract = "Most ATP benchmarks embed the final answer within the formal statement {---} a convention we call ``Easy Mode'' {---} a design that simplifies the task relative to what human competitors face and may lead to optimistic estimates of model capability.We call the stricter, more realistic setting ``Hard Mode'': the system must independently discover the answer before constructing a formal proof.To enable Hard Mode research, we make two contributions.First, we release MiniF2F-Hard and FIMO-Hard, expert-reannotated Hard Mode variants of two widely-used ATP benchmarks.Second, we introduce Discover And Prove (DAP), an agentic framework that uses LLM natural-language reasoning with explicit self-reflection to discover answers, then rewrites Hard Mode statements into Easy Mode ones for existing ATP provers.DAP sets the state of the art: on CombiBench it raises solved problems from 7 (previous SOTA, Pass@16) to 10; on PutnamBench it is the first system to formally prove 36 theorems in Hard Mode {---} while simultaneously revealing that state-of-the-art LLMs exceed 80{\%} answer accuracy on the same problems where formal provers manage under 10{\%}, exposing a substantial gap that Hard Mode benchmarks are uniquely suited to measure."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2026-discover">
<titleInfo>
<title>Discover and Prove: An Open-source Agentic Framework for Hard Mode Automated Theorem Proving in Lean 4</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chengwu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yichun</namePart>
<namePart type="family">Yin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ye</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaxuan</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Botao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siqi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianhao</namePart>
<namePart type="family">Shen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lifeng</namePart>
<namePart type="family">Shang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Most ATP benchmarks embed the final answer within the formal statement — a convention we call “Easy Mode” — a design that simplifies the task relative to what human competitors face and may lead to optimistic estimates of model capability.We call the stricter, more realistic setting “Hard Mode”: the system must independently discover the answer before constructing a formal proof.To enable Hard Mode research, we make two contributions.First, we release MiniF2F-Hard and FIMO-Hard, expert-reannotated Hard Mode variants of two widely-used ATP benchmarks.Second, we introduce Discover And Prove (DAP), an agentic framework that uses LLM natural-language reasoning with explicit self-reflection to discover answers, then rewrites Hard Mode statements into Easy Mode ones for existing ATP provers.DAP sets the state of the art: on CombiBench it raises solved problems from 7 (previous SOTA, Pass@16) to 10; on PutnamBench it is the first system to formally prove 36 theorems in Hard Mode — while simultaneously revealing that state-of-the-art LLMs exceed 80% answer accuracy on the same problems where formal provers manage under 10%, exposing a substantial gap that Hard Mode benchmarks are uniquely suited to measure.</abstract>
<identifier type="citekey">liu-etal-2026-discover</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.3/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>117</start>
<end>133</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Discover and Prove: An Open-source Agentic Framework for Hard Mode Automated Theorem Proving in Lean 4
%A Liu, Chengwu
%A Yin, Yichun
%A Yuan, Ye
%A Xie, Jiaxuan
%A Li, Botao
%A Li, Siqi
%A Shen, Jianhao
%A Xu, Yan
%A Shang, Lifeng
%A Zhang, Ming
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F liu-etal-2026-discover
%X Most ATP benchmarks embed the final answer within the formal statement — a convention we call “Easy Mode” — a design that simplifies the task relative to what human competitors face and may lead to optimistic estimates of model capability.We call the stricter, more realistic setting “Hard Mode”: the system must independently discover the answer before constructing a formal proof.To enable Hard Mode research, we make two contributions.First, we release MiniF2F-Hard and FIMO-Hard, expert-reannotated Hard Mode variants of two widely-used ATP benchmarks.Second, we introduce Discover And Prove (DAP), an agentic framework that uses LLM natural-language reasoning with explicit self-reflection to discover answers, then rewrites Hard Mode statements into Easy Mode ones for existing ATP provers.DAP sets the state of the art: on CombiBench it raises solved problems from 7 (previous SOTA, Pass@16) to 10; on PutnamBench it is the first system to formally prove 36 theorems in Hard Mode — while simultaneously revealing that state-of-the-art LLMs exceed 80% answer accuracy on the same problems where formal provers manage under 10%, exposing a substantial gap that Hard Mode benchmarks are uniquely suited to measure.
%U https://aclanthology.org/2026.acl-long.3/
%P 117-133
Markdown (Informal)
[Discover and Prove: An Open-source Agentic Framework for Hard Mode Automated Theorem Proving in Lean 4](https://aclanthology.org/2026.acl-long.3/) (Liu et al., ACL 2026)
ACL
- Chengwu Liu, Yichun Yin, Ye Yuan, Jiaxuan Xie, Botao Li, Siqi Li, Jianhao Shen, Yan Xu, Lifeng Shang, and Ming Zhang. 2026. Discover and Prove: An Open-source Agentic Framework for Hard Mode Automated Theorem Proving in Lean 4. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 117–133, San Diego, California, United States. Association for Computational Linguistics.