% ACL Anthology record 2026.semeval-1.304 (BibTeX ignores text outside @entries).
@inproceedings{arslan-etal-2026-malto,
  author    = {Arslan, H{\"u}seyin and
               Munis, Evren Ayberk and
               Khudonogov, Timofei and
               Akgun, Mert and
               Besli, Murat and
               Meherrem, Ayhan and
               Savelli, Claudio and
               Giobergia, Flavio},
  title     = {{MALTO} at {SemEval}-2026 Task 13: Detecting Human, {AI}, and Hybrid Code via Hard Negative Mining and Curriculum-Driven Ensembles},
  editor    = {Kochmar, Ekaterina and
               Ghosh, Debanjan and
               North, Kai and
               Komachi, Mamoru},
  booktitle = {Proceedings of the 20th {International} {Workshop} on {Semantic} {Evaluation} (2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.semeval-1.304/},
  pages     = {2413--2420},
  isbn      = {979-8-89176-414-9},
  abstract  = {The rapid advancement of Large Language Models (LLMs) has significantly impacted software engineering, posing challenges for determining the origin and authenticity of source code. This paper presents the MALTO team{'}s submission for SemEval-2026 Task 13, explicitly focusing on Subtask B (Authorship Attribution among 11 classes) and Subtask C (Hybrid Code Detection). To address severe class imbalance and the complex boundaries of mixed human-machine code, we propose a unified framework that leverages an ensemble of UniXcoder and CodeT5. Our approach integrates a robust Tree-sitter-based Universal Canonicalization strategy, Data Augmentation, and a novel 3-Phase Curriculum Training schedule enhanced by Hard Negative Mining. Specifically, UniXcoder{'}s cross-modal representations excel at distinguishing among semantically overlapping LLM families (Subtask B), whereas CodeT5{'}s identifier-aware architecture is superior at detecting subtle structural anomalies in hybrid and adversarial snippets (Subtask C). By aggregating these complementary strengths, our soft-voting ensemble overcomes the limitations of individual models, demonstrating strong robustness against imbalanced distributions and effectively discriminating between purely human, purely machine, hybrid, and adversarial code snippets.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey arslan-etal-2026-malto: paper metadata, with the host proceedings described in relatedItem. -->
  <mods ID="arslan-etal-2026-malto">
    <titleInfo>
      <title>MALTO at SemEval-2026 Task 13: Detecting Human, AI, and Hybrid Code via Hard Negative Mining and Curriculum-Driven Ensembles</title>
    </titleInfo>
    <!-- Paper authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Hüseyin</namePart>
      <namePart type="family">Arslan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Evren</namePart>
      <namePart type="given">Ayberk</namePart>
      <namePart type="family">Munis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Timofei</namePart>
      <namePart type="family">Khudonogov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mert</namePart>
      <namePart type="family">Akgun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Murat</namePart>
      <namePart type="family">Besli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ayhan</namePart>
      <namePart type="family">Meherrem</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Claudio</namePart>
      <namePart type="family">Savelli</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Flavio</namePart>
      <namePart type="family">Giobergia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 20th International Workshop on Semantic Evaluation (2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Kochmar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Debanjan</namePart>
        <namePart type="family">Ghosh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kai</namePart>
        <namePart type="family">North</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mamoru</namePart>
        <namePart type="family">Komachi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-414-9</identifier>
    </relatedItem>
    <abstract>The rapid advancement of Large Language Models (LLMs) has significantly impacted software engineering, posing challenges for determining the origin and authenticity of source code. This paper presents the MALTO team’s submission for SemEval-2026 Task 13, explicitly focusing on Subtask B (Authorship Attribution among 11 classes) and Subtask C (Hybrid Code Detection). To address severe class imbalance and the complex boundaries of mixed human-machine code, we propose a unified framework that leverages an ensemble of UniXcoder and CodeT5. Our approach integrates a robust Tree-sitter-based Universal Canonicalization strategy, Data Augmentation, and a novel 3-Phase Curriculum Training schedule enhanced by Hard Negative Mining. Specifically, UniXcoder’s cross-modal representations excel at distinguishing among semantically overlapping LLM families (Subtask B), whereas CodeT5’s identifier-aware architecture is superior at detecting subtle structural anomalies in hybrid and adversarial snippets (Subtask C). By aggregating these complementary strengths, our soft-voting ensemble overcomes the limitations of individual models, demonstrating strong robustness against imbalanced distributions and effectively discriminating between purely human, purely machine, hybrid, and adversarial code snippets.</abstract>
    <identifier type="citekey">arslan-etal-2026-malto</identifier>
    <location>
      <url>https://aclanthology.org/2026.semeval-1.304/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>2413</start>
        <end>2420</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T MALTO at SemEval-2026 Task 13: Detecting Human, AI, and Hybrid Code via Hard Negative Mining and Curriculum-Driven Ensembles
%A Arslan, Hüseyin
%A Munis, Evren Ayberk
%A Khudonogov, Timofei
%A Akgun, Mert
%A Besli, Murat
%A Meherrem, Ayhan
%A Savelli, Claudio
%A Giobergia, Flavio
%Y Kochmar, Ekaterina
%Y Ghosh, Debanjan
%Y North, Kai
%Y Komachi, Mamoru
%S Proceedings of the 20th International Workshop on Semantic Evaluation (2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-414-9
%F arslan-etal-2026-malto
%X The rapid advancement of Large Language Models (LLMs) has significantly impacted software engineering, posing challenges for determining the origin and authenticity of source code. This paper presents the MALTO team’s submission for SemEval-2026 Task 13, explicitly focusing on Subtask B (Authorship Attribution among 11 classes) and Subtask C (Hybrid Code Detection). To address severe class imbalance and the complex boundaries of mixed human-machine code, we propose a unified framework that leverages an ensemble of UniXcoder and CodeT5. Our approach integrates a robust Tree-sitter-based Universal Canonicalization strategy, Data Augmentation, and a novel 3-Phase Curriculum Training schedule enhanced by Hard Negative Mining. Specifically, UniXcoder’s cross-modal representations excel at distinguishing among semantically overlapping LLM families (Subtask B), whereas CodeT5’s identifier-aware architecture is superior at detecting subtle structural anomalies in hybrid and adversarial snippets (Subtask C). By aggregating these complementary strengths, our soft-voting ensemble overcomes the limitations of individual models, demonstrating strong robustness against imbalanced distributions and effectively discriminating between purely human, purely machine, hybrid, and adversarial code snippets.
%U https://aclanthology.org/2026.semeval-1.304/
%P 2413-2420
Markdown (Informal)
[MALTO at SemEval-2026 Task 13: Detecting Human, AI, and Hybrid Code via Hard Negative Mining and Curriculum-Driven Ensembles](https://aclanthology.org/2026.semeval-1.304/) (Arslan et al., SemEval 2026)
ACL
- Hüseyin Arslan, Evren Ayberk Munis, Timofei Khudonogov, Mert Akgun, Murat Besli, Ayhan Meherrem, Claudio Savelli, and Flavio Giobergia. 2026. MALTO at SemEval-2026 Task 13: Detecting Human, AI, and Hybrid Code via Hard Negative Mining and Curriculum-Driven Ensembles. In Proceedings of the 20th International Workshop on Semantic Evaluation (2026), pages 2413–2420, San Diego, California, USA. Association for Computational Linguistics.