@inproceedings{gain-etal-2025-transforming,
title = "Transforming Code Understanding: Clustering-Based Retrieval for Improved Summarization in Domain-Specific Languages",
author = "Gain, Baban and
Bandyopadhyay, Dibyanayan and
Mukherjee, Samrat and
Sahoo, Aryan and
Dana, Saswati and
Kodeswaran, Palanivel and
Sen, Sayandeep and
Ekbal, Asif and
Garg, Dinesh",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven and
Darwish, Kareem and
Agarwal, Apoorv",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: Industry Track",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-industry.47/",
pages = "546--560",
abstract = "A domain-specific extension of C language known as \textit{extended Berkeley Packet Filter (eBPF)} has gained widespread acceptance for various tasks, including observability, security, and network acceleration in the cloud community. Due to its recency and complexity, there is an overwhelming need for natural language summaries of existing eBPF codes (particularly open-source code) for practitioners and developers, which will go a long way in easing the understanding and development of new code. However, being a niche Domain-Specific Language (DSL), there is a scarcity of available training data. In this paper, we investigate the effectiveness of LLMs for summarizing low-resource DSLs, in the context of eBPF codes. Specifically, we propose a clustering-based technique to retrieve in-context examples that are semantically closer to the test example and propose a very simple yet powerful prompt design that yields superior-quality code summary generation. Experimental results show that our proposed retrieval approach for prompt generation improves the eBPF code summarization accuracy up to 12.9 BLEU points over other prompting techniques. The codes are available at https://github.com/babangain/ebpf{\_}summ."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gain-etal-2025-transforming">
<titleInfo>
<title>Transforming Code Understanding: Clustering-Based Retrieval for Improved Summarization in Domain-Specific Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Baban</namePart>
<namePart type="family">Gain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dibyanayan</namePart>
<namePart type="family">Bandyopadhyay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samrat</namePart>
<namePart type="family">Mukherjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aryan</namePart>
<namePart type="family">Sahoo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saswati</namePart>
<namePart type="family">Dana</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Palanivel</namePart>
<namePart type="family">Kodeswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sayandeep</namePart>
<namePart type="family">Sen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dinesh</namePart>
<namePart type="family">Garg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apoorv</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A domain-specific extension of C language known as extended Berkeley Packet Filter (eBPF) has gained widespread acceptance for various tasks, including observability, security, and network acceleration in the cloud community. Due to its recency and complexity, there is an overwhelming need for natural language summaries of existing eBPF codes (particularly open-source code) for practitioners and developers, which will go a long way in easing the understanding and development of new code. However, being a niche Domain-Specific Language (DSL), there is a scarcity of available training data. In this paper, we investigate the effectiveness of LLMs for summarizing low-resource DSLs, in the context of eBPF codes. Specifically, we propose a clustering-based technique to retrieve in-context examples that are semantically closer to the test example and propose a very simple yet powerful prompt design that yields superior-quality code summary generation. Experimental results show that our proposed retrieval approach for prompt generation improves the eBPF code summarization accuracy up to 12.9 BLEU points over other prompting techniques. The codes are available at https://github.com/babangain/ebpf_summ.</abstract>
<identifier type="citekey">gain-etal-2025-transforming</identifier>
<location>
<url>https://aclanthology.org/2025.coling-industry.47/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>546</start>
<end>560</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Transforming Code Understanding: Clustering-Based Retrieval for Improved Summarization in Domain-Specific Languages
%A Gain, Baban
%A Bandyopadhyay, Dibyanayan
%A Mukherjee, Samrat
%A Sahoo, Aryan
%A Dana, Saswati
%A Kodeswaran, Palanivel
%A Sen, Sayandeep
%A Ekbal, Asif
%A Garg, Dinesh
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%Y Darwish, Kareem
%Y Agarwal, Apoorv
%S Proceedings of the 31st International Conference on Computational Linguistics: Industry Track
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F gain-etal-2025-transforming
%X A domain-specific extension of C language known as extended Berkeley Packet Filter (eBPF) has gained widespread acceptance for various tasks, including observability, security, and network acceleration in the cloud community. Due to its recency and complexity, there is an overwhelming need for natural language summaries of existing eBPF codes (particularly open-source code) for practitioners and developers, which will go a long way in easing the understanding and development of new code. However, being a niche Domain-Specific Language (DSL), there is a scarcity of available training data. In this paper, we investigate the effectiveness of LLMs for summarizing low-resource DSLs, in the context of eBPF codes. Specifically, we propose a clustering-based technique to retrieve in-context examples that are semantically closer to the test example and propose a very simple yet powerful prompt design that yields superior-quality code summary generation. Experimental results show that our proposed retrieval approach for prompt generation improves the eBPF code summarization accuracy up to 12.9 BLEU points over other prompting techniques. The codes are available at https://github.com/babangain/ebpf_summ.
%U https://aclanthology.org/2025.coling-industry.47/
%P 546-560
Markdown (Informal)
[Transforming Code Understanding: Clustering-Based Retrieval for Improved Summarization in Domain-Specific Languages](https://aclanthology.org/2025.coling-industry.47/) (Gain et al., COLING 2025)
ACL
- Baban Gain, Dibyanayan Bandyopadhyay, Samrat Mukherjee, Aryan Sahoo, Saswati Dana, Palanivel Kodeswaran, Sayandeep Sen, Asif Ekbal, and Dinesh Garg. 2025. Transforming Code Understanding: Clustering-Based Retrieval for Improved Summarization in Domain-Specific Languages. In Proceedings of the 31st International Conference on Computational Linguistics: Industry Track, pages 546–560, Abu Dhabi, UAE. Association for Computational Linguistics.