@inproceedings{cao-etal-2026-ascendkernelgen,
title = "{A}scend{K}ernel{G}en: {LLM}-Driven Kernel Generation for {NPU}s",
author = "Cao, Xinzi and
Zhai, Jianyang and
Li, Pengfei and
Hu, Zhiheng and
Yan, Cen and
Mubingxu and
Fang, Guanghuan and
She, Bin and
Li, Jiayu and
Su, Yihan and
Tao, Dongyang and
Yang, Feidiao and
Wang, Chang-Dong and
Lu, Yutong and
Xue, Weicheng and
Zhou, Bin and
Tian, Yonghong",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1533/",
pages = "30693--30718",
ISBN = "979-8-89176-395-1",
abstract = "Neural Processing Units (NPUs) are critical for AI infrastructure, yet developing kernels remains a bottleneck due to the complexity of vendor-specific Domain-Specific Languages (DSLs). While LLMs excel in general coding, they fail to meet the stringent constraints of NPU development, showing a near-zero success rate on complex kernels in our preliminary study. To address these challenges, we present AscendKernelGen, the first comprehensive framework for NPU kernel development, marking a pioneering effort in this field. This framework consists of three interconnected components: (1) Ascend-CoT, the first dataset in the NPU kernel domain that incorporates chain-of-thought reasoning from real-world kernel implementations; (2) KernelGen-LM, a domain-adaptive model trained on this novel dataset using supervised fine-tuning and reinforcement learning; and (3) NPUKernelBench, the first benchmark platform designed to evaluate the compilation, correctness, and performance of generated NPU kernels. Experimental results demonstrate that our approach dramatically bridges the gap in hardware-specific coding: compilation success on complex Level-2 kernels improves from 0{\%} to 95.5{\%} (Pass@10), with 64{\%} functional correctness. AscendKernelGen is available at AscendKernelGen and NPUKernelBench."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="cao-etal-2026-ascendkernelgen">
<titleInfo>
<title>AscendKernelGen: LLM-Driven Kernel Generation for NPUs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xinzi</namePart>
<namePart type="family">Cao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianyang</namePart>
<namePart type="family">Zhai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pengfei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhiheng</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cen</namePart>
<namePart type="family">Yan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart>Mubingxu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanghuan</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">She</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiayu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yihan</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dongyang</namePart>
<namePart type="family">Tao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Feidiao</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chang-Dong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yutong</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weicheng</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yonghong</namePart>
<namePart type="family">Tian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Neural Processing Units (NPUs) are critical for AI infrastructure, yet developing kernels remains a bottleneck due to the complexity of vendor-specific Domain-Specific Languages (DSLs). While LLMs excel in general coding, they fail to meet the stringent constraints of NPU development, showing a near-zero success rate on complex kernels in our preliminary study. To address these challenges, we present AscendKernelGen, the first comprehensive framework for NPU kernel development, marking a pioneering effort in this field. This framework consists of three interconnected components: (1) Ascend-CoT, the first dataset in the NPU kernel domain that incorporates chain-of-thought reasoning from real-world kernel implementations; (2) KernelGen-LM, a domain-adaptive model trained on this novel dataset using supervised fine-tuning and reinforcement learning; and (3) NPUKernelBench, the first benchmark platform designed to evaluate the compilation, correctness, and performance of generated NPU kernels. Experimental results demonstrate that our approach dramatically bridges the gap in hardware-specific coding: compilation success on complex Level-2 kernels improves from 0% to 95.5% (Pass@10), with 64% functional correctness. AscendKernelGen is available at AscendKernelGen and NPUKernelBench.</abstract>
<identifier type="citekey">cao-etal-2026-ascendkernelgen</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1533/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>30693</start>
<end>30718</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AscendKernelGen: LLM-Driven Kernel Generation for NPUs
%A Cao, Xinzi
%A Zhai, Jianyang
%A Li, Pengfei
%A Hu, Zhiheng
%A Yan, Cen
%A Mubingxu
%A Fang, Guanghuan
%A She, Bin
%A Li, Jiayu
%A Su, Yihan
%A Tao, Dongyang
%A Yang, Feidiao
%A Wang, Chang-Dong
%A Lu, Yutong
%A Xue, Weicheng
%A Zhou, Bin
%A Tian, Yonghong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F cao-etal-2026-ascendkernelgen
%X Neural Processing Units (NPUs) are critical for AI infrastructure, yet developing kernels remains a bottleneck due to the complexity of vendor-specific Domain-Specific Languages (DSLs). While LLMs excel in general coding, they fail to meet the stringent constraints of NPU development, showing a near-zero success rate on complex kernels in our preliminary study. To address these challenges, we present AscendKernelGen, the first comprehensive framework for NPU kernel development, marking a pioneering effort in this field. This framework consists of three interconnected components: (1) Ascend-CoT, the first dataset in the NPU kernel domain that incorporates chain-of-thought reasoning from real-world kernel implementations; (2) KernelGen-LM, a domain-adaptive model trained on this novel dataset using supervised fine-tuning and reinforcement learning; and (3) NPUKernelBench, the first benchmark platform designed to evaluate the compilation, correctness, and performance of generated NPU kernels. Experimental results demonstrate that our approach dramatically bridges the gap in hardware-specific coding: compilation success on complex Level-2 kernels improves from 0% to 95.5% (Pass@10), with 64% functional correctness. AscendKernelGen is available at AscendKernelGen and NPUKernelBench.
%U https://aclanthology.org/2026.findings-acl.1533/
%P 30693-30718
Markdown (Informal)
[AscendKernelGen: LLM-Driven Kernel Generation for NPUs](https://aclanthology.org/2026.findings-acl.1533/) (Cao et al., Findings 2026)
ACL
- Xinzi Cao, Jianyang Zhai, Pengfei Li, Zhiheng Hu, Cen Yan, Mubingxu, Guanghuan Fang, Bin She, Jiayu Li, Yihan Su, Dongyang Tao, Feidiao Yang, Chang-Dong Wang, Yutong Lu, Weicheng Xue, Bin Zhou, and Yonghong Tian. 2026. AscendKernelGen: LLM-Driven Kernel Generation for NPUs. In Findings of the Association for Computational Linguistics: ACL 2026, pages 30693–30718, San Diego, California, United States. Association for Computational Linguistics.