@inproceedings{miceli-barone-sennrich-2017-parallel,
title = "A Parallel Corpus of Python Functions and Documentation Strings for Automated Code Documentation and Code Generation",
author = "Miceli Barone, Antonio Valerio and
Sennrich, Rico",
editor = "Kondrak, Greg and
Watanabe, Taro",
booktitle = "Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = nov,
year = "2017",
address = "Taipei, Taiwan",
publisher = "Asian Federation of Natural Language Processing",
url = "https://aclanthology.org/I17-2053",
pages = "314--319",
abstract = "Automated documentation of programming source code and automated code generation from natural language are challenging tasks of both practical and scientific interest. Progress in these areas has been limited by the low availability of parallel corpora of code and natural language descriptions, which tend to be small and constrained to specific domains. In this work we introduce a large and diverse parallel corpus of a hundred thousands Python functions with their documentation strings ({``}docstrings{''}) generated by scraping open source repositories on GitHub. We describe baseline results for the code documentation and code generation tasks obtained by neural machine translation. We also experiment with data augmentation techniques to further increase the amount of training data. We release our datasets and processing scripts in order to stimulate research in these areas.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="miceli-barone-sennrich-2017-parallel">
<titleInfo>
<title>A Parallel Corpus of Python Functions and Documentation Strings for Automated Code Documentation and Code Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Antonio</namePart>
<namePart type="given">Valerio</namePart>
<namePart type="family">Miceli Barone</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rico</namePart>
<namePart type="family">Sennrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Greg</namePart>
<namePart type="family">Kondrak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Taro</namePart>
<namePart type="family">Watanabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Asian Federation of Natural Language Processing</publisher>
<place>
<placeTerm type="text">Taipei, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automated documentation of programming source code and automated code generation from natural language are challenging tasks of both practical and scientific interest. Progress in these areas has been limited by the low availability of parallel corpora of code and natural language descriptions, which tend to be small and constrained to specific domains. In this work we introduce a large and diverse parallel corpus of a hundred thousands Python functions with their documentation strings (“docstrings”) generated by scraping open source repositories on GitHub. We describe baseline results for the code documentation and code generation tasks obtained by neural machine translation. We also experiment with data augmentation techniques to further increase the amount of training data. We release our datasets and processing scripts in order to stimulate research in these areas.</abstract>
<identifier type="citekey">miceli-barone-sennrich-2017-parallel</identifier>
<location>
<url>https://aclanthology.org/I17-2053</url>
</location>
<part>
<date>2017-11</date>
<extent unit="page">
<start>314</start>
<end>319</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Parallel Corpus of Python Functions and Documentation Strings for Automated Code Documentation and Code Generation
%A Miceli Barone, Antonio Valerio
%A Sennrich, Rico
%Y Kondrak, Greg
%Y Watanabe, Taro
%S Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2017
%8 November
%I Asian Federation of Natural Language Processing
%C Taipei, Taiwan
%F miceli-barone-sennrich-2017-parallel
%X Automated documentation of programming source code and automated code generation from natural language are challenging tasks of both practical and scientific interest. Progress in these areas has been limited by the low availability of parallel corpora of code and natural language descriptions, which tend to be small and constrained to specific domains. In this work we introduce a large and diverse parallel corpus of a hundred thousands Python functions with their documentation strings (“docstrings”) generated by scraping open source repositories on GitHub. We describe baseline results for the code documentation and code generation tasks obtained by neural machine translation. We also experiment with data augmentation techniques to further increase the amount of training data. We release our datasets and processing scripts in order to stimulate research in these areas.
%U https://aclanthology.org/I17-2053
%P 314-319
Markdown (Informal)
[A Parallel Corpus of Python Functions and Documentation Strings for Automated Code Documentation and Code Generation](https://aclanthology.org/I17-2053) (Miceli Barone & Sennrich, IJCNLP 2017)
ACL