@inproceedings{huang-etal-2025-opencoder,
title = "{O}pen{C}oder: The Open Cookbook for Top-Tier Code Large Language Models",
author = "Huang, Siming and
Cheng, Tianhao and
Liu, Jason Klein and
Xu, Weidi and
Hao, Jiaran and
Song, Liuyihan and
Xu, Yang and
Yang, Jian and
Liu, Jiaheng and
Zhang, Chenchen and
Chai, Linzheng and
Yuan, Ruifeng and
Luo, Xianzhen and
Wang, Qiufeng and
Fan, YuanTao and
Zhu, Qingfu and
Zhang, Zhaoxiang and
Gao, Yang and
Fu, Jie and
Liu, Qian and
Li, Houyi and
Zhang, Ge and
Qi, Yuan and
Yinghui, Xu and
Chu, Wei and
Wang, Zili",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1591/",
doi = "10.18653/v1/2025.acl-long.1591",
pages = "33167--33193",
ISBN = "979-8-89176-251-0",
abstract = "Code LLMs have been widely used in various domains, including code generation, logical reasoning, and agent systems. However, open-access code LLMs mostly only release weights, lacking key features such as reproducible data pipelines and transparent training protocols, which are crucial for advancing deeper, more reliable investigations. To address the gap, we introduce OpenCoder, a top-tier code LLM that not only achieves performance comparable to leading models but also serves as an ``open cookbook'' for the research community. Unlike most prior efforts, we release not only model weights and inference code, but also the reproducible training data, complete data processing pipeline, rigorous experimental ablation results, and detailed training protocols for open scientific research. Our work identifies the key ingredients for building a top-tier code LLM: optimized heuristic rules for data cleaning and deduplication, effective recall of code-related text corpus, and high-quality synthetic data for both annealing and supervised fine-tuning stages. By offering this level of openness, we aim to broaden access to all aspects of a top-tier code LLM, with OpenCoder serving as both a powerful model and an open foundation to accelerate research and enable reproducible advancements in code intelligence. The released resource is available at https://opencoder-llm.github.io."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="huang-etal-2025-opencoder">
<titleInfo>
<title>OpenCoder: The Open Cookbook for Top-Tier Code Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Siming</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tianhao</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jason</namePart>
<namePart type="given">Klein</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weidi</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaran</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liuyihan</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jian</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiaheng</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenchen</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Linzheng</namePart>
<namePart type="family">Chai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruifeng</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xianzhen</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qiufeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">YuanTao</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingfu</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhaoxiang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jie</namePart>
<namePart type="family">Fu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qian</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Houyi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ge</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuan</namePart>
<namePart type="family">Qi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xu</namePart>
<namePart type="family">Yinghui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Chu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zili</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Code LLMs have been widely used in various domains, including code generation, logical reasoning, and agent systems. However, open-access code LLMs mostly only release weights, lacking key features such as reproducible data pipelines and transparent training protocols, which are crucial for advancing deeper, more reliable investigations. To address the gap, we introduce OpenCoder, a top-tier code LLM that not only achieves performance comparable to leading models but also serves as an “open cookbook” for the research community. Unlike most prior efforts, we release not only model weights and inference code, but also the reproducible training data, complete data processing pipeline, rigorous experimental ablation results, and detailed training protocols for open scientific research. Our work identifies the key ingredients for building a top-tier code LLM: optimized heuristic rules for data cleaning and deduplication, effective recall of code-related text corpus, and high-quality synthetic data for both annealing and supervised fine-tuning stages. By offering this level of openness, we aim to broaden access to all aspects of a top-tier code LLM, with OpenCoder serving as both a powerful model and an open foundation to accelerate research and enable reproducible advancements in code intelligence. The released resource is available at https://opencoder-llm.github.io.</abstract>
<identifier type="citekey">huang-etal-2025-opencoder</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1591</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1591/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>33167</start>
<end>33193</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OpenCoder: The Open Cookbook for Top-Tier Code Large Language Models
%A Huang, Siming
%A Cheng, Tianhao
%A Liu, Jason Klein
%A Xu, Weidi
%A Hao, Jiaran
%A Song, Liuyihan
%A Xu, Yang
%A Yang, Jian
%A Liu, Jiaheng
%A Zhang, Chenchen
%A Chai, Linzheng
%A Yuan, Ruifeng
%A Luo, Xianzhen
%A Wang, Qiufeng
%A Fan, YuanTao
%A Zhu, Qingfu
%A Zhang, Zhaoxiang
%A Gao, Yang
%A Fu, Jie
%A Liu, Qian
%A Li, Houyi
%A Zhang, Ge
%A Qi, Yuan
%A Yinghui, Xu
%A Chu, Wei
%A Wang, Zili
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F huang-etal-2025-opencoder
%X Code LLMs have been widely used in various domains, including code generation, logical reasoning, and agent systems. However, open-access code LLMs mostly only release weights, lacking key features such as reproducible data pipelines and transparent training protocols, which are crucial for advancing deeper, more reliable investigations. To address the gap, we introduce OpenCoder, a top-tier code LLM that not only achieves performance comparable to leading models but also serves as an “open cookbook” for the research community. Unlike most prior efforts, we release not only model weights and inference code, but also the reproducible training data, complete data processing pipeline, rigorous experimental ablation results, and detailed training protocols for open scientific research. Our work identifies the key ingredients for building a top-tier code LLM: optimized heuristic rules for data cleaning and deduplication, effective recall of code-related text corpus, and high-quality synthetic data for both annealing and supervised fine-tuning stages. By offering this level of openness, we aim to broaden access to all aspects of a top-tier code LLM, with OpenCoder serving as both a powerful model and an open foundation to accelerate research and enable reproducible advancements in code intelligence. The released resource is available at https://opencoder-llm.github.io.
%R 10.18653/v1/2025.acl-long.1591
%U https://aclanthology.org/2025.acl-long.1591/
%U https://doi.org/10.18653/v1/2025.acl-long.1591
%P 33167-33193
Markdown (Informal)
[OpenCoder: The Open Cookbook for Top-Tier Code Large Language Models](https://aclanthology.org/2025.acl-long.1591/) (Huang et al., ACL 2025)
ACL
- Siming Huang, Tianhao Cheng, Jason Klein Liu, Weidi Xu, Jiaran Hao, Liuyihan Song, Yang Xu, Jian Yang, Jiaheng Liu, Chenchen Zhang, Linzheng Chai, Ruifeng Yuan, Xianzhen Luo, Qiufeng Wang, YuanTao Fan, Qingfu Zhu, Zhaoxiang Zhang, Yang Gao, Jie Fu, Qian Liu, Houyi Li, Ge Zhang, Yuan Qi, Xu Yinghui, Wei Chu, and Zili Wang. 2025. OpenCoder: The Open Cookbook for Top-Tier Code Large Language Models. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 33167–33193, Vienna, Austria. Association for Computational Linguistics.