@inproceedings{kumar-etal-2026-odasim,
title = "{ODAS}im: Ordered, Distinctive and Absolute Semantic Similarity for Code Explanation Evaluation",
author = "Kumar, Prince and
Munigala, Vitobha and
Sen, Jaydeep and
Mittal, Ashish and
Kumar, Vishwajeet and
Tamilselvam, Srikanth G.",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1415/",
pages = "28390--28403",
ISBN = "979-8-89176-395-1",
abstract = "Code explanations are increasingly generated by large language models and used in software engineering workflows, making reliable evaluation essential. However, existing model-based and embedding-based methods often fail to distinguish correct explanations from partially or fully incorrect ones, and their similarity scores are poorly calibrated and do not reflect meaningful differences in explanation quality. To address this, we propose ODASim (Orderly, Distinctive, and Absolute Similarity), a model-agnostic graded fine-tuning framework for embedding models that learns calibrated similarity representations between code and explanations. To support fine-grained supervision and evaluation, we also introduce ODA-X, a novel benchmark for code-to-explanation quality grading, comprising code{--}explanation pairs with graded similarity labels derived from strategic perturbations of gold explanations. We apply our ODASim approach to multiple embedding models and evaluate it on two benchmarks: the widely popular CodeXGLUE and our proposed benchmark ODA-X, spanning four programming languages - Python, Java, JavaScript, and Go. Results show that our method achieves up to a 35{\%} improvement in F1 score and an 85{\%} reduction in Expected Calibration Error (ECE), enabling reliable evaluation of code-to-explanation quality."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kumar-etal-2026-odasim">
<titleInfo>
<title>ODASim: Ordered, Distinctive and Absolute Semantic Similarity for Code Explanation Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prince</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vitobha</namePart>
<namePart type="family">Munigala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaydeep</namePart>
<namePart type="family">Sen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Mittal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vishwajeet</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Srikanth</namePart>
<namePart type="given">G</namePart>
<namePart type="family">Tamilselvam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Code explanations are increasingly generated by large language models and used in software engineering workflows, making reliable evaluation essential. However, existing model-based and embedding-based methods often fail to distinguish correct explanations from partially or fully incorrect ones, and their similarity scores are poorly calibrated and do not reflect meaningful differences in explanation quality. To address this, we propose ODASim (Orderly, Distinctive, and Absolute Similarity), a model-agnostic graded fine-tuning framework for embedding models that learns calibrated similarity representations between code and explanations. To support fine-grained supervision and evaluation, we also introduce ODA-X, a novel benchmark for code-to-explanation quality grading, comprising code–explanation pairs with graded similarity labels derived from strategic perturbations of gold explanations. We apply our ODASim approach to multiple embedding models and evaluate it on two benchmarks: the widely popular CodeXGLUE and our proposed benchmark ODA-X, spanning four programming languages - Python, Java, JavaScript, and Go. Results show that our method achieves up to a 35% improvement in F1 score and an 85% reduction in Expected Calibration Error (ECE), enabling reliable evaluation of code-to-explanation quality.</abstract>
<identifier type="citekey">kumar-etal-2026-odasim</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1415/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>28390</start>
<end>28403</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ODASim: Ordered, Distinctive and Absolute Semantic Similarity for Code Explanation Evaluation
%A Kumar, Prince
%A Munigala, Vitobha
%A Sen, Jaydeep
%A Mittal, Ashish
%A Kumar, Vishwajeet
%A Tamilselvam, Srikanth G.
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F kumar-etal-2026-odasim
%X Code explanations are increasingly generated by large language models and used in software engineering workflows, making reliable evaluation essential. However, existing model-based and embedding-based methods often fail to distinguish correct explanations from partially or fully incorrect ones, and their similarity scores are poorly calibrated and do not reflect meaningful differences in explanation quality. To address this, we propose ODASim (Orderly, Distinctive, and Absolute Similarity), a model-agnostic graded fine-tuning framework for embedding models that learns calibrated similarity representations between code and explanations. To support fine-grained supervision and evaluation, we also introduce ODA-X, a novel benchmark for code-to-explanation quality grading, comprising code–explanation pairs with graded similarity labels derived from strategic perturbations of gold explanations. We apply our ODASim approach to multiple embedding models and evaluate it on two benchmarks: the widely popular CodeXGLUE and our proposed benchmark ODA-X, spanning four programming languages - Python, Java, JavaScript, and Go. Results show that our method achieves up to a 35% improvement in F1 score and an 85% reduction in Expected Calibration Error (ECE), enabling reliable evaluation of code-to-explanation quality.
%U https://aclanthology.org/2026.findings-acl.1415/
%P 28390-28403
Markdown (Informal)
[ODASim: Ordered, Distinctive and Absolute Semantic Similarity for Code Explanation Evaluation](https://aclanthology.org/2026.findings-acl.1415/) (Kumar et al., Findings 2026)
ACL
- Prince Kumar, Vitobha Munigala, Jaydeep Sen, Ashish Mittal, Vishwajeet Kumar, and Srikanth G. Tamilselvam. 2026. ODASim: Ordered, Distinctive and Absolute Semantic Similarity for Code Explanation Evaluation. In Findings of the Association for Computational Linguistics: ACL 2026, pages 28390–28403, San Diego, California, United States. Association for Computational Linguistics.