@inproceedings{peck-brown-2024-tool,
title = "Tool for Constructing a Large-Scale Corpus of Code Comments and Other Source Code Annotations",
author = "Peck, Luna and
Brown, Susan",
editor = "Gorman, Kyle and
Prud'hommeaux, Emily and
Roark, Brian and
Sproat, Richard",
booktitle = "Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.cawl-1.3",
pages = "18--22",
abstract = "The sublanguage of source code annotations{---}explanatory natural language writing that accompanies programming source code{---}is little-studied in linguistics. To facilitate research into this domain, we have developed a program prototype that can extract code comments and changelogs (i.e. commit messages) from public, open-source code repositories, with automatic tokenization and part-of-speech tagging on the extracted text. The program can also automatically detect and discard {``}commented-out{''} source code in data from Python repositories, to prevent it from polluting the corpus, demonstrating that such sanitization is likely feasible for other programming languages as well. With the current tool, we have produced a 6-million word corpus of English-language comments extracted from three different programming languages: Python, C, and C++.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="peck-brown-2024-tool">
<titleInfo>
<title>Tool for Constructing a Large-Scale Corpus of Code Comments and Other Source Code Annotations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luna</namePart>
<namePart type="family">Peck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Susan</namePart>
<namePart type="family">Brown</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Gorman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Prud’hommeaux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Sproat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The sublanguage of source code annotations—explanatory natural language writing that accompanies programming source code—is little-studied in linguistics. To facilitate research into this domain, we have developed a program prototype that can extract code comments and changelogs (i.e. commit messages) from public, open-source code repositories, with automatic tokenization and part-of-speech tagging on the extracted text. The program can also automatically detect and discard “commented-out” source code in data from Python repositories, to prevent it from polluting the corpus, demonstrating that such sanitization is likely feasible for other programming languages as well. With the current tool, we have produced a 6-million word corpus of English-language comments extracted from three different programming languages: Python, C, and C++.</abstract>
<identifier type="citekey">peck-brown-2024-tool</identifier>
<location>
<url>https://aclanthology.org/2024.cawl-1.3</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>18</start>
<end>22</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tool for Constructing a Large-Scale Corpus of Code Comments and Other Source Code Annotations
%A Peck, Luna
%A Brown, Susan
%Y Gorman, Kyle
%Y Prud’hommeaux, Emily
%Y Roark, Brian
%Y Sproat, Richard
%S Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F peck-brown-2024-tool
%X The sublanguage of source code annotations—explanatory natural language writing that accompanies programming source code—is little-studied in linguistics. To facilitate research into this domain, we have developed a program prototype that can extract code comments and changelogs (i.e. commit messages) from public, open-source code repositories, with automatic tokenization and part-of-speech tagging on the extracted text. The program can also automatically detect and discard “commented-out” source code in data from Python repositories, to prevent it from polluting the corpus, demonstrating that such sanitization is likely feasible for other programming languages as well. With the current tool, we have produced a 6-million word corpus of English-language comments extracted from three different programming languages: Python, C, and C++.
%U https://aclanthology.org/2024.cawl-1.3
%P 18-22
Markdown (Informal)
[Tool for Constructing a Large-Scale Corpus of Code Comments and Other Source Code Annotations](https://aclanthology.org/2024.cawl-1.3) (Peck & Brown, CAWL-WS 2024)
ACL