@inproceedings{kaplan-etal-2026-paracodex,
title = "{P}ara{C}odex: A Profiling-Guided Autonomous Coding Agent for Reliable Parallel Code Generation and Translation",
author = "Kaplan, Erel and
Bitan, Tomer and
Ghrayeb, Lian and
Chen, Le and
Yotam, Tom and
Hasabnis, Niranjan and
Oren, Gal",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.732/",
pages = "16113--16136",
ISBN = "979-8-89176-390-6",
abstract = "Parallel programming is central to HPC and AI, but producing code that is correct and fast remains challenging, especially for OpenMP GPU offload, where data movement and tuning dominate. Autonomous coding agents can compile, test, and profile on target hardware, but outputs are brittle without domain scaffolding. We present ParaCodex, an HPC-engineer workflow that turns a Codex-based agent into an autonomous OpenMP GPU offload system using staged hotspot analysis, explicit data planning, correctness gating, and profiling-guided refinement. We evaluate translation from serial CPU kernels to OpenMP GPU offload kernels on HeCBench, Rodinia, and NAS. After excluding five kernels, ParaCodex succeeded on all 31 valid kernels. In 27/31 (87{\%}) of these valid cases, the generated kernels improved GPU time over reference implementations, a result that holds independently on both the A100 and RTX 4060. The resulting OpenMP kernels achieve geometric-mean speedups of 3.1 (A100) and 3.6 (RTX 4060) on HeCBench and 1.5 and 1.1 on Rodinia, and outperform a zero-shot Codex baseline on all suites. We also evaluate CUDA -{\ensuremath{>}} OpenMP offload translation on ParEval, where ParaCodex maintains high compilation and validation rates in code-only and end-to-end settings. ParaCodex is available at https://github.com/Scientific-Computing-Lab/ParaCodex"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kaplan-etal-2026-paracodex">
<titleInfo>
<title>ParaCodex: A Profiling-Guided Autonomous Coding Agent for Reliable Parallel Code Generation and Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Erel</namePart>
<namePart type="family">Kaplan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tomer</namePart>
<namePart type="family">Bitan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lian</namePart>
<namePart type="family">Ghrayeb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Le</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Yotam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niranjan</namePart>
<namePart type="family">Hasabnis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gal</namePart>
<namePart type="family">Oren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Parallel programming is central to HPC and AI, but producing code that is correct and fast remains challenging, especially for OpenMP GPU offload, where data movement and tuning dominate. Autonomous coding agents can compile, test, and profile on target hardware, but outputs are brittle without domain scaffolding. We present ParaCodex, an HPC-engineer workflow that turns a Codex-based agent into an autonomous OpenMP GPU offload system using staged hotspot analysis, explicit data planning, correctness gating, and profiling-guided refinement. We evaluate translation from serial CPU kernels to OpenMP GPU offload kernels on HeCBench, Rodinia, and NAS. After excluding five kernels, ParaCodex succeeded on all 31 valid kernels. In 27/31 (87%) of these valid cases, the generated kernels improved GPU time over reference implementations, a result that holds independently on both the A100 and RTX 4060. The resulting OpenMP kernels achieve geometric-mean speedups of 3.1 (A100) and 3.6 (RTX 4060) on HeCBench and 1.5 and 1.1 on Rodinia, and outperform a zero-shot Codex baseline on all suites. We also evaluate CUDA -&gt; OpenMP offload translation on ParEval, where ParaCodex maintains high compilation and validation rates in code-only and end-to-end settings. ParaCodex is available at https://github.com/Scientific-Computing-Lab/ParaCodex</abstract>
<identifier type="citekey">kaplan-etal-2026-paracodex</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.732/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>16113</start>
<end>16136</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ParaCodex: A Profiling-Guided Autonomous Coding Agent for Reliable Parallel Code Generation and Translation
%A Kaplan, Erel
%A Bitan, Tomer
%A Ghrayeb, Lian
%A Chen, Le
%A Yotam, Tom
%A Hasabnis, Niranjan
%A Oren, Gal
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F kaplan-etal-2026-paracodex
%X Parallel programming is central to HPC and AI, but producing code that is correct and fast remains challenging, especially for OpenMP GPU offload, where data movement and tuning dominate. Autonomous coding agents can compile, test, and profile on target hardware, but outputs are brittle without domain scaffolding. We present ParaCodex, an HPC-engineer workflow that turns a Codex-based agent into an autonomous OpenMP GPU offload system using staged hotspot analysis, explicit data planning, correctness gating, and profiling-guided refinement. We evaluate translation from serial CPU kernels to OpenMP GPU offload kernels on HeCBench, Rodinia, and NAS. After excluding five kernels, ParaCodex succeeded on all 31 valid kernels. In 27/31 (87%) of these valid cases, the generated kernels improved GPU time over reference implementations, a result that holds independently on both the A100 and RTX 4060. The resulting OpenMP kernels achieve geometric-mean speedups of 3.1 (A100) and 3.6 (RTX 4060) on HeCBench and 1.5 and 1.1 on Rodinia, and outperform a zero-shot Codex baseline on all suites. We also evaluate CUDA -> OpenMP offload translation on ParEval, where ParaCodex maintains high compilation and validation rates in code-only and end-to-end settings. ParaCodex is available at https://github.com/Scientific-Computing-Lab/ParaCodex
%U https://aclanthology.org/2026.acl-long.732/
%P 16113-16136
Markdown (Informal)
[ParaCodex: A Profiling-Guided Autonomous Coding Agent for Reliable Parallel Code Generation and Translation](https://aclanthology.org/2026.acl-long.732/) (Kaplan et al., ACL 2026)
ACL
- Erel Kaplan, Tomer Bitan, Lian Ghrayeb, Le Chen, Tom Yotam, Niranjan Hasabnis, and Gal Oren. 2026. ParaCodex: A Profiling-Guided Autonomous Coding Agent for Reliable Parallel Code Generation and Translation. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 16113–16136, San Diego, California, United States. Association for Computational Linguistics.