@inproceedings{janicki-2022-optimizing,
title = "Optimizing the weighted sequence alignment algorithm for large-scale text similarity computation",
author = "Janicki, Maciej",
booktitle = "Proceedings of the 2nd International Workshop on Natural Language Processing for Digital Humanities",
month = nov,
year = "2022",
address = "Taipei, Taiwan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.nlp4dh-1.13",
pages = "96--100",
abstract = "We present an optimized implementation of the weighted sequence alignment algorithm (a.k.a. weighted edit distance) in a scenario where the items to align are numeric vectors and the substitution weights are determined by their cosine similarity. The optimization relies on using vector and matrix operations provided by numeric computation libraries (including GPU acceleration) instead of loops. The resulting algorithm provides an efficient way of aligning large sets of texts represented as sequences of continuous-space numeric vectors (embeddings). The optimization made it possible to compute alignment-based similarity for all pairs of texts in a large corpus of Finnic oral folk poetry for the purpose of studying intertextuality in the oral tradition.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="janicki-2022-optimizing">
<titleInfo>
<title>Optimizing the weighted sequence alignment algorithm for large-scale text similarity computation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maciej</namePart>
<namePart type="family">Janicki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd International Workshop on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Taipei, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present an optimized implementation of the weighted sequence alignment algorithm (a.k.a. weighted edit distance) in a scenario where the items to align are numeric vectors and the substitution weights are determined by their cosine similarity. The optimization relies on using vector and matrix operations provided by numeric computation libraries (including GPU acceleration) instead of loops. The resulting algorithm provides an efficient way of aligning large sets of texts represented as sequences of continuous-space numeric vectors (embeddings). The optimization made it possible to compute alignment-based similarity for all pairs of texts in a large corpus of Finnic oral folk poetry for the purpose of studying intertextuality in the oral tradition.</abstract>
<identifier type="citekey">janicki-2022-optimizing</identifier>
<location>
<url>https://aclanthology.org/2022.nlp4dh-1.13</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>96</start>
<end>100</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Optimizing the weighted sequence alignment algorithm for large-scale text similarity computation
%A Janicki, Maciej
%S Proceedings of the 2nd International Workshop on Natural Language Processing for Digital Humanities
%D 2022
%8 November
%I Association for Computational Linguistics
%C Taipei, Taiwan
%F janicki-2022-optimizing
%X We present an optimized implementation of the weighted sequence alignment algorithm (a.k.a. weighted edit distance) in a scenario where the items to align are numeric vectors and the substitution weights are determined by their cosine similarity. The optimization relies on using vector and matrix operations provided by numeric computation libraries (including GPU acceleration) instead of loops. The resulting algorithm provides an efficient way of aligning large sets of texts represented as sequences of continuous-space numeric vectors (embeddings). The optimization made it possible to compute alignment-based similarity for all pairs of texts in a large corpus of Finnic oral folk poetry for the purpose of studying intertextuality in the oral tradition.
%U https://aclanthology.org/2022.nlp4dh-1.13
%P 96-100
Markdown (Informal)
[Optimizing the weighted sequence alignment algorithm for large-scale text similarity computation](https://aclanthology.org/2022.nlp4dh-1.13) (Janicki, NLP4DH 2022)
ACL