@inproceedings{mahmud-etal-2021-code,
  title     = {Code to Comment Translation: A Comparative Study on Model Effectiveness {\&} Errors},
  author    = {Mahmud, Junayed and Faisal, Fahim and Arnob, Raihan Islam and Anastasopoulos, Antonios and Moran, Kevin},
  editor    = {Lachmy, Royi and Yao, Ziyu and Durrett, Greg and Gligoric, Milos and Li, Junyi Jessy and Mooney, Ray and Neubig, Graham and Su, Yu and Sun, Huan and Tsarfaty, Reut},
  booktitle = {Proceedings of the 1st Workshop on Natural Language Processing for Programming (NLP4Prog 2021)},
  month     = aug,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.nlp4prog-1.1/},
  doi       = {10.18653/v1/2021.nlp4prog-1.1},
  pages     = {1--16},
  abstract  = {Automated source code summarization is a popular software engineering research topic wherein machine translation models are employed to {\textquotedblleft}translate{\textquotedblright} code snippets into relevant natural language descriptions. Most evaluations of such models are conducted using automatic reference-based metrics. However, given the relatively large semantic gap between programming languages and natural language, we argue that this line of research would benefit from a qualitative investigation into the various error modes of current state-of-the-art models. Therefore, in this work, we perform both a quantitative and qualitative comparison of three recently proposed source code summarization models. In our quantitative evaluation, we compare the models based on the smoothed BLEU-4, METEOR, and ROUGE-L machine translation metrics, and in our qualitative evaluation, we perform a manual open-coding of the most common errors committed by the models when compared to ground truth captions. Our investigation reveals new insights into the relationship between metric-based performance and model prediction errors grounded in an error taxonomy that can be used to drive future research efforts.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mahmud-etal-2021-code">
<titleInfo>
<title>Code to Comment Translation: A Comparative Study on Model Effectiveness &amp; Errors</title>
</titleInfo>
<name type="personal">
<namePart type="given">Junayed</namePart>
<namePart type="family">Mahmud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fahim</namePart>
<namePart type="family">Faisal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raihan</namePart>
<namePart type="given">Islam</namePart>
<namePart type="family">Arnob</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Moran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Natural Language Processing for Programming (NLP4Prog 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Royi</namePart>
<namePart type="family">Lachmy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziyu</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Greg</namePart>
<namePart type="family">Durrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Milos</namePart>
<namePart type="family">Gligoric</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junyi</namePart>
<namePart type="given">Jessy</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ray</namePart>
<namePart type="family">Mooney</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Graham</namePart>
<namePart type="family">Neubig</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Su</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huan</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reut</namePart>
<namePart type="family">Tsarfaty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automated source code summarization is a popular software engineering research topic wherein machine translation models are employed to “translate” code snippets into relevant natural language descriptions. Most evaluations of such models are conducted using automatic reference-based metrics. However, given the relatively large semantic gap between programming languages and natural language, we argue that this line of research would benefit from a qualitative investigation into the various error modes of current state-of-the-art models. Therefore, in this work, we perform both a quantitative and qualitative comparison of three recently proposed source code summarization models. In our quantitative evaluation, we compare the models based on the smoothed BLEU-4, METEOR, and ROUGE-L machine translation metrics, and in our qualitative evaluation, we perform a manual open-coding of the most common errors committed by the models when compared to ground truth captions. Our investigation reveals new insights into the relationship between metric-based performance and model prediction errors grounded in an error taxonomy that can be used to drive future research efforts.</abstract>
<identifier type="citekey">mahmud-etal-2021-code</identifier>
<identifier type="doi">10.18653/v1/2021.nlp4prog-1.1</identifier>
<location>
<url>https://aclanthology.org/2021.nlp4prog-1.1/</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>1</start>
<end>16</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Code to Comment Translation: A Comparative Study on Model Effectiveness & Errors
%A Mahmud, Junayed
%A Faisal, Fahim
%A Arnob, Raihan Islam
%A Anastasopoulos, Antonios
%A Moran, Kevin
%Y Lachmy, Royi
%Y Yao, Ziyu
%Y Durrett, Greg
%Y Gligoric, Milos
%Y Li, Junyi Jessy
%Y Mooney, Ray
%Y Neubig, Graham
%Y Su, Yu
%Y Sun, Huan
%Y Tsarfaty, Reut
%S Proceedings of the 1st Workshop on Natural Language Processing for Programming (NLP4Prog 2021)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F mahmud-etal-2021-code
%X Automated source code summarization is a popular software engineering research topic wherein machine translation models are employed to “translate” code snippets into relevant natural language descriptions. Most evaluations of such models are conducted using automatic reference-based metrics. However, given the relatively large semantic gap between programming languages and natural language, we argue that this line of research would benefit from a qualitative investigation into the various error modes of current state-of-the-art models. Therefore, in this work, we perform both a quantitative and qualitative comparison of three recently proposed source code summarization models. In our quantitative evaluation, we compare the models based on the smoothed BLEU-4, METEOR, and ROUGE-L machine translation metrics, and in our qualitative evaluation, we perform a manual open-coding of the most common errors committed by the models when compared to ground truth captions. Our investigation reveals new insights into the relationship between metric-based performance and model prediction errors grounded in an error taxonomy that can be used to drive future research efforts.
%R 10.18653/v1/2021.nlp4prog-1.1
%U https://aclanthology.org/2021.nlp4prog-1.1/
%U https://doi.org/10.18653/v1/2021.nlp4prog-1.1
%P 1-16
Markdown (Informal)
[Code to Comment Translation: A Comparative Study on Model Effectiveness & Errors](https://aclanthology.org/2021.nlp4prog-1.1/) (Mahmud et al., NLP4Prog 2021)
ACL