@inproceedings{fedrushkov-etal-2026-semantic,
title = "Semantic vs. Structural Signals: Log-Probability and {LLM}-as-a-Judge for Reference-Free Code Evaluation",
author = "Fedrushkov, Dmitriy and
He, Yulong and
Smirnov, Ivan and
Aliev, Artem and
Kovalchuk, Sergey",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.55/",
pages = "574--581",
ISBN = "979-8-89176-423-1",
abstract = "Reference-free evaluation of LLM-generated code is essential when execution-based testing is unavailable or costly. We compare two paradigms: $\textbf{explicit LLM-as-a-Judge}$ scoring, which assigns a quality score to a solution, and $\textbf{log-probability scoring}$, which uses $\log P_\theta(\text{code} \mid \text{task})$ as an instruction-free signal. Across HumanEval-X, we find that the two approaches capture $\textit{qualitatively different aspects}$ of code correctness. Explicit judges {---} particularly larger models {---} perform strongly on generated code, reflecting their ability to reason about task-solution alignment, but fail to distinguish correct solutions from minimally mutated ones. Log-probability exhibits the opposite pattern: weaker performance on generated code, but consistent pairwise separation of canonical from mutated solutions. These results reveal a $\textbf{discrimination-ranking dissociation}$ and show that the two paradigms provide complementary, non-interchangeable signals: explicit judges capture semantic correctness, while log-probability captures local structural consistency."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fedrushkov-etal-2026-semantic">
<titleInfo>
<title>Semantic vs. Structural Signals: Log-Probability and LLM-as-a-Judge for Reference-Free Code Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dmitriy</namePart>
<namePart type="family">Fedrushkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Smirnov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Artem</namePart>
<namePart type="family">Aliev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sergey</namePart>
<namePart type="family">Kovalchuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>Reference-free evaluation of LLM-generated code is essential when execution-based testing is unavailable or costly. We compare two paradigms: explicit LLM-as-a-Judge scoring, which assigns a quality score to a solution, and log-probability scoring, which uses $\log P_\theta(\text{code} \mid \text{task})$ as an instruction-free signal. Across HumanEval-X, we find that the two approaches capture qualitatively different aspects of code correctness. Explicit judges — particularly larger models — perform strongly on generated code, reflecting their ability to reason about task-solution alignment, but fail to distinguish correct solutions from minimally mutated ones. Log-probability exhibits the opposite pattern: weaker performance on generated code, but consistent pairwise separation of canonical from mutated solutions. These results reveal a discrimination-ranking dissociation and show that the two paradigms provide complementary, non-interchangeable signals: explicit judges capture semantic correctness, while log-probability captures local structural consistency.</abstract>
<identifier type="citekey">fedrushkov-etal-2026-semantic</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.55/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>574</start>
<end>581</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Semantic vs. Structural Signals: Log-Probability and LLM-as-a-Judge for Reference-Free Code Evaluation
%A Fedrushkov, Dmitriy
%A He, Yulong
%A Smirnov, Ivan
%A Aliev, Artem
%A Kovalchuk, Sergey
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F fedrushkov-etal-2026-semantic
%X Reference-free evaluation of LLM-generated code is essential when execution-based testing is unavailable or costly. We compare two paradigms: explicit LLM-as-a-Judge scoring, which assigns a quality score to a solution, and log-probability scoring, which uses $\log P_\theta(\text{code} \mid \text{task})$ as an instruction-free signal. Across HumanEval-X, we find that the two approaches capture qualitatively different aspects of code correctness. Explicit judges — particularly larger models — perform strongly on generated code, reflecting their ability to reason about task-solution alignment, but fail to distinguish correct solutions from minimally mutated ones. Log-probability exhibits the opposite pattern: weaker performance on generated code, but consistent pairwise separation of canonical from mutated solutions. These results reveal a discrimination-ranking dissociation and show that the two paradigms provide complementary, non-interchangeable signals: explicit judges capture semantic correctness, while log-probability captures local structural consistency.
%U https://aclanthology.org/2026.gem-main.55/
%P 574-581
Markdown (Informal)
[Semantic vs. Structural Signals: Log-Probability and LLM-as-a-Judge for Reference-Free Code Evaluation](https://aclanthology.org/2026.gem-main.55/) (Fedrushkov et al., GEM 2026)
ACL