@inproceedings{argueta-chiang-2019-accelerating,
title = "Accelerating Sparse Matrix Operations in Neural Networks on Graphics Processing Units",
author = "Argueta, Arturo and
Chiang, David",
editor = "Korhonen, Anna and
Traum, David and
M{\`a}rquez, Llu{\'i}s",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1626/",
doi = "10.18653/v1/P19-1626",
pages = "6215--6224",
abstract = "Graphics Processing Units (GPUs) are commonly used to train and evaluate neural networks efficiently. While previous work in deep learning has focused on accelerating operations on dense matrices/tensors on GPUs, efforts have concentrated on operations involving sparse data structures. Operations using sparse structures are common in natural language models at the input and output layers, because these models operate on sequences over discrete alphabets. We present two new GPU algorithms: one at the input layer, for multiplying a matrix by a few-hot vector (generalizing the more common operation of multiplication by a one-hot vector) and one at the output layer, for a fused softmax and top-N selection (commonly used in beam search). Our methods achieve speedups over state-of-the-art parallel GPU baselines of up to 7x and 50x, respectively. We also illustrate how our methods scale on different GPU architectures."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="argueta-chiang-2019-accelerating">
    <titleInfo>
      <title>Accelerating Sparse Matrix Operations in Neural Networks on Graphics Processing Units</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Arturo</namePart>
      <namePart type="family">Argueta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="family">Chiang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2019-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Korhonen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Traum</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lluís</namePart>
        <namePart type="family">Màrquez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Florence, Italy</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Graphics Processing Units (GPUs) are commonly used to train and evaluate neural networks efficiently. While previous work in deep learning has focused on accelerating operations on dense matrices/tensors on GPUs, efforts have concentrated less on operations involving sparse data structures. Operations using sparse structures are common in natural language models at the input and output layers, because these models operate on sequences over discrete alphabets. We present two new GPU algorithms: one at the input layer, for multiplying a matrix by a few-hot vector (generalizing the more common operation of multiplication by a one-hot vector) and one at the output layer, for a fused softmax and top-N selection (commonly used in beam search). Our methods achieve speedups over state-of-the-art parallel GPU baselines of up to 7x and 50x, respectively. We also illustrate how our methods scale on different GPU architectures.</abstract>
<identifier type="citekey">argueta-chiang-2019-accelerating</identifier>
<identifier type="doi">10.18653/v1/P19-1626</identifier>
<location>
<url>https://aclanthology.org/P19-1626/</url>
</location>
<part>
<date>2019-07</date>
<extent unit="page">
<start>6215</start>
<end>6224</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Accelerating Sparse Matrix Operations in Neural Networks on Graphics Processing Units
%A Argueta, Arturo
%A Chiang, David
%Y Korhonen, Anna
%Y Traum, David
%Y Màrquez, Lluís
%S Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics
%D 2019
%8 July
%I Association for Computational Linguistics
%C Florence, Italy
%F argueta-chiang-2019-accelerating
%X Graphics Processing Units (GPUs) are commonly used to train and evaluate neural networks efficiently. While previous work in deep learning has focused on accelerating operations on dense matrices/tensors on GPUs, efforts have concentrated less on operations involving sparse data structures. Operations using sparse structures are common in natural language models at the input and output layers, because these models operate on sequences over discrete alphabets. We present two new GPU algorithms: one at the input layer, for multiplying a matrix by a few-hot vector (generalizing the more common operation of multiplication by a one-hot vector) and one at the output layer, for a fused softmax and top-N selection (commonly used in beam search). Our methods achieve speedups over state-of-the-art parallel GPU baselines of up to 7x and 50x, respectively. We also illustrate how our methods scale on different GPU architectures.
%R 10.18653/v1/P19-1626
%U https://aclanthology.org/P19-1626/
%U https://doi.org/10.18653/v1/P19-1626
%P 6215-6224
Markdown (Informal)
[Accelerating Sparse Matrix Operations in Neural Networks on Graphics Processing Units](https://aclanthology.org/P19-1626/) (Argueta & Chiang, ACL 2019)
ACL
Arturo Argueta and David Chiang. 2019. Accelerating Sparse Matrix Operations in Neural Networks on Graphics Processing Units. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 6215–6224, Florence, Italy. Association for Computational Linguistics.
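
As an informal illustration of the first algorithm described in the abstract (multiplying a matrix by a few-hot vector), the NumPy sketch below shows the operation's semantics: since the vector is nonzero in only a few positions, the product collapses to a weighted sum of a few matrix columns. This is a sketch of the operation being accelerated, not the authors' GPU kernel; the function name `few_hot_matvec` and the `(indices, values)` representation are assumptions made for the example.

```python
import numpy as np

def few_hot_matvec(W, indices, values):
    """Multiply W (d x V) by a few-hot vector stored as (indices, values).

    The vector is zero everywhere except at `indices`, so the product
    reduces to a weighted sum of a handful of columns of W instead of
    a dense matrix-vector product over all V columns.
    """
    return W[:, indices] @ values

# Check against the equivalent dense product: a 3-hot input over a
# 50k-word vocabulary with embedding dimension 512.
rng = np.random.default_rng(0)
W = rng.standard_normal((512, 50_000))
idx = np.array([7, 1_234, 42_000])   # nonzero positions
val = np.array([0.5, 0.25, 0.25])    # nonzero values
dense = np.zeros(50_000)
dense[idx] = val
assert np.allclose(few_hot_matvec(W, idx, val), W @ dense)
```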
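
The second algorithm fuses softmax with top-N selection at the output layer, as used in beam search. Below is a sequential NumPy sketch of the fused operation's semantics, not the paper's parallel GPU formulation; `fused_softmax_topn` is an illustrative name. The sketch relies on softmax being monotone: the top-N logits identify the top-N probabilities, so only N probabilities ever need to be materialized.

```python
import numpy as np

def fused_softmax_topn(logits, n):
    """Return indices and softmax probabilities of the n highest logits.

    Softmax is monotone, so selecting the top-n logits first and then
    normalizing against the full row gives the same result as a full
    softmax followed by sorting, while materializing only n
    probabilities.
    """
    top = np.argpartition(logits, -n)[-n:]      # top-n indices, unordered
    top = top[np.argsort(logits[top])[::-1]]    # sort by score, descending
    shifted = logits - logits.max()             # stabilize the exponentials
    denom = np.exp(shifted).sum()               # normalizer over the full row
    return top, np.exp(shifted[top]) / denom

# Pick the 5 best next tokens from a 50k-way output layer.
rng = np.random.default_rng(0)
scores = rng.standard_normal(50_000)
idx, probs = fused_softmax_topn(scores, 5)
```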