@inproceedings{sorokin-etal-2025-cct,
title = "{CCT}-Code: Cross-Consistency Training for Multilingual Clone Detection and Code Search",
author = "Sorokin, Nikita and
  Tikhonov, Anton and
Abulkhanov, Dmitry and
Sedykh, Ivan and
Piontkovskaya, Irina and
Malykh, Valentin",
editor = "Ebrahimi, Abteen and
Haider, Samar and
Liu, Emmy and
Haider, Sammar and
Leonor Pacheco, Maria and
Wein, Shira",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)",
month = apr,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-srw.17/",
doi = "10.18653/v1/2025.naacl-srw.17",
pages = "178--185",
    isbn = "979-8-89176-192-6",
    abstract = "We consider the well-known and important tasks of clone detection and information retrieval for source code. The most standard setup is to search clones inside the same language code snippets. But it is also useful to find code snippets with identical behaviour in different programming languages. Nevertheless multi- and cross-lingual clone detection has been little studied in literature. We present a novel training procedure, cross-consistency training (CCT) leveraging cross-lingual similarity, that we apply to train language models on source code in various programming languages. We show that this training is effective both for encoder- and decoder-based models. The trained encoder-based CCT-LM model achieves a new state of the art on POJ-104 (monolingual C++ clone detection benchmark) with 96.73{\%} MAP and AdvTest (monolingual Python code search benchmark) with 47.18{\%} MRR. The decoder-based CCT-LM model shows comparable performance in these tasks. In addition, we formulate the multi- and cross-lingual clone detection problem and present XCD, a new benchmark dataset produced from CodeForces submissions."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sorokin-etal-2025-cct">
<titleInfo>
<title>CCT-Code: Cross-Consistency Training for Multilingual Clone Detection and Code Search</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikita</namePart>
<namePart type="family">Sorokin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
        <namePart type="given">Anton</namePart>
        <namePart type="family">Tikhonov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dmitry</namePart>
<namePart type="family">Abulkhanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Sedykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Irina</namePart>
<namePart type="family">Piontkovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Malykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samar</namePart>
<namePart type="family">Haider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmy</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sammar</namePart>
<namePart type="family">Haider</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Leonor Pacheco</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shira</namePart>
<namePart type="family">Wein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-192-6</identifier>
</relatedItem>
    <abstract>We consider the well-known and important tasks of clone detection and information retrieval for source code. The most standard setup is to search clones inside the same language code snippets. But it is also useful to find code snippets with identical behaviour in different programming languages. Nevertheless multi- and cross-lingual clone detection has been little studied in literature. We present a novel training procedure, cross-consistency training (CCT) leveraging cross-lingual similarity, that we apply to train language models on source code in various programming languages. We show that this training is effective both for encoder- and decoder-based models. The trained encoder-based CCT-LM model achieves a new state of the art on POJ-104 (monolingual C++ clone detection benchmark) with 96.73% MAP and AdvTest (monolingual Python code search benchmark) with 47.18% MRR. The decoder-based CCT-LM model shows comparable performance in these tasks. In addition, we formulate the multi- and cross-lingual clone detection problem and present XCD, a new benchmark dataset produced from CodeForces submissions.</abstract>
<identifier type="citekey">sorokin-etal-2025-cct</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-srw.17</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-srw.17/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>178</start>
<end>185</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CCT-Code: Cross-Consistency Training for Multilingual Clone Detection and Code Search
%A Sorokin, Nikita
%A Tikhonov, Anton
%A Abulkhanov, Dmitry
%A Sedykh, Ivan
%A Piontkovskaya, Irina
%A Malykh, Valentin
%Y Ebrahimi, Abteen
%Y Haider, Samar
%Y Liu, Emmy
%Y Haider, Sammar
%Y Leonor Pacheco, Maria
%Y Wein, Shira
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, USA
%@ 979-8-89176-192-6
%F sorokin-etal-2025-cct
%X We consider the well-known and important tasks of clone detection and information retrieval for source code. The most standard setup is to search clones inside the same language code snippets. But it is also useful to find code snippets with identical behaviour in different programming languages. Nevertheless multi- and cross-lingual clone detection has been little studied in literature. We present a novel training procedure, cross-consistency training (CCT) leveraging cross-lingual similarity, that we apply to train language models on source code in various programming languages. We show that this training is effective both for encoder- and decoder-based models. The trained encoder-based CCT-LM model achieves a new state of the art on POJ-104 (monolingual C++ clone detection benchmark) with 96.73% MAP and AdvTest (monolingual Python code search benchmark) with 47.18% MRR. The decoder-based CCT-LM model shows comparable performance in these tasks. In addition, we formulate the multi- and cross-lingual clone detection problem and present XCD, a new benchmark dataset produced from CodeForces submissions.
%R 10.18653/v1/2025.naacl-srw.17
%U https://aclanthology.org/2025.naacl-srw.17/
%U https://doi.org/10.18653/v1/2025.naacl-srw.17
%P 178-185
Markdown (Informal)
[CCT-Code: Cross-Consistency Training for Multilingual Clone Detection and Code Search](https://aclanthology.org/2025.naacl-srw.17/) (Sorokin et al., NAACL 2025)
ACL
- Nikita Sorokin, Anton Tikhonov, Dmitry Abulkhanov, Ivan Sedykh, Irina Piontkovskaya, and Valentin Malykh. 2025. CCT-Code: Cross-Consistency Training for Multilingual Clone Detection and Code Search. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop), pages 178–185, Albuquerque, USA. Association for Computational Linguistics.