@inproceedings{brunila-laviolette-2021-wmdecompose,
title = "{WMD}ecompose: A Framework for Leveraging the Interpretable Properties of Word Mover{'}s Distance in Sociocultural Analysis",
author = "Brunila, Mikael and
LaViolette, Jack",
editor = "Degaetano-Ortlieb, Stefania and
Kazantseva, Anna and
Reiter, Nils and
Szpakowicz, Stan",
booktitle = "Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic (online)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.latechclfl-1.18",
doi = "10.18653/v1/2021.latechclfl-1.18",
pages = "154--167",
abstract = "Despite the increasing popularity of NLP in the humanities and social sciences, advances in model performance and complexity have been accompanied by concerns about interpretability and explanatory power for sociocultural analysis. One popular model that takes a middle road is Word Mover{'}s Distance (WMD). Ostensibly adapted for its interpretability, WMD has nonetheless been used and further developed in ways which frequently discard its most interpretable aspect: namely, the word-level distances required for translating a set of words into another set of words. To address this apparent gap, we introduce WMDecompose: a model and Python library that 1) decomposes document-level distances into their constituent word-level distances, and 2) subsequently clusters words to induce thematic elements, such that useful lexical information is retained and summarized for analysis. To illustrate its potential in a social scientific context, we apply it to a longitudinal social media corpus to explore the interrelationship between conspiracy theories and conservative American discourses. Finally, because of the full WMD model{'}s high time-complexity, we additionally suggest a method of sampling document pairs from large datasets in a reproducible way, with tight bounds that prevent extrapolation of unreliable results due to poor sampling practices.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="brunila-laviolette-2021-wmdecompose">
<titleInfo>
<title>WMDecompose: A Framework for Leveraging the Interpretable Properties of Word Mover’s Distance in Sociocultural Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mikael</namePart>
<namePart type="family">Brunila</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">LaViolette</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano-Ortlieb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nils</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Punta Cana, Dominican Republic (online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Despite the increasing popularity of NLP in the humanities and social sciences, advances in model performance and complexity have been accompanied by concerns about interpretability and explanatory power for sociocultural analysis. One popular model that takes a middle road is Word Mover’s Distance (WMD). Ostensibly adapted for its interpretability, WMD has nonetheless been used and further developed in ways which frequently discard its most interpretable aspect: namely, the word-level distances required for translating a set of words into another set of words. To address this apparent gap, we introduce WMDecompose: a model and Python library that 1) decomposes document-level distances into their constituent word-level distances, and 2) subsequently clusters words to induce thematic elements, such that useful lexical information is retained and summarized for analysis. To illustrate its potential in a social scientific context, we apply it to a longitudinal social media corpus to explore the interrelationship between conspiracy theories and conservative American discourses. Finally, because of the full WMD model’s high time-complexity, we additionally suggest a method of sampling document pairs from large datasets in a reproducible way, with tight bounds that prevent extrapolation of unreliable results due to poor sampling practices.</abstract>
<identifier type="citekey">brunila-laviolette-2021-wmdecompose</identifier>
<identifier type="doi">10.18653/v1/2021.latechclfl-1.18</identifier>
<location>
<url>https://aclanthology.org/2021.latechclfl-1.18</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>154</start>
<end>167</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T WMDecompose: A Framework for Leveraging the Interpretable Properties of Word Mover’s Distance in Sociocultural Analysis
%A Brunila, Mikael
%A LaViolette, Jack
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Reiter, Nils
%Y Szpakowicz, Stan
%S Proceedings of the 5th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic (online)
%F brunila-laviolette-2021-wmdecompose
%X Despite the increasing popularity of NLP in the humanities and social sciences, advances in model performance and complexity have been accompanied by concerns about interpretability and explanatory power for sociocultural analysis. One popular model that takes a middle road is Word Mover’s Distance (WMD). Ostensibly adapted for its interpretability, WMD has nonetheless been used and further developed in ways which frequently discard its most interpretable aspect: namely, the word-level distances required for translating a set of words into another set of words. To address this apparent gap, we introduce WMDecompose: a model and Python library that 1) decomposes document-level distances into their constituent word-level distances, and 2) subsequently clusters words to induce thematic elements, such that useful lexical information is retained and summarized for analysis. To illustrate its potential in a social scientific context, we apply it to a longitudinal social media corpus to explore the interrelationship between conspiracy theories and conservative American discourses. Finally, because of the full WMD model’s high time-complexity, we additionally suggest a method of sampling document pairs from large datasets in a reproducible way, with tight bounds that prevent extrapolation of unreliable results due to poor sampling practices.
%R 10.18653/v1/2021.latechclfl-1.18
%U https://aclanthology.org/2021.latechclfl-1.18
%U https://doi.org/10.18653/v1/2021.latechclfl-1.18
%P 154-167
Markdown (Informal)
[WMDecompose: A Framework for Leveraging the Interpretable Properties of Word Mover’s Distance in Sociocultural Analysis](https://aclanthology.org/2021.latechclfl-1.18) (Brunila & LaViolette, LaTeCHCLfL 2021)
ACL