@inproceedings{li-2024-tracing,
title = "Tracing the Genealogies of Ideas with Sentence Embeddings",
author = "Li, Lucian",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
{\"O}hman, Emily and
Miyagawa, So and
Alnajjar, Khalid and
Bizzoni, Yuri},
booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
month = nov,
year = "2024",
address = "Miami, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.nlp4dh-1.2",
pages = "9--16",
abstract = "Detecting intellectual influence in unstructured text is an important problem for a wide range of fields, including intellectual history, social science, and bibliometrics. A wide range of previous studies in computational social science and digital humanities have attempted to resolve this through a range of dictionary, embedding, and language model based methods. I introduce an approach which leverages a sentence embedding index to efficiently search for similar ideas in a large historical corpus. This method remains robust in conditions of high OCR error found in real mass digitized historical corpora that disrupt previous published methods, while also capturing paraphrase and indirect influence. I evaluate this method on a large corpus of 250,000 nonfiction texts from the 19th century, and find that discovered influence is in line with history of science literature. By expanding the scope of our search for influence and the origins of ideas beyond traditional structured corpora and canonical works and figures, we can get a more nuanced perspective on influence and idea dissemination that can encompass epistemically marginalized groups.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-2024-tracing">
<titleInfo>
<title>Tracing the Genealogies of Ideas with Sentence Embeddings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emily</namePart>
<namePart type="family">Öhman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">So</namePart>
<namePart type="family">Miyagawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Detecting intellectual influence in unstructured text is an important problem for a wide range of fields, including intellectual history, social science, and bibliometrics. A wide range of previous studies in computational social science and digital humanities have attempted to resolve this through a range of dictionary, embedding, and language model based methods. I introduce an approach which leverages a sentence embedding index to efficiently search for similar ideas in a large historical corpus. This method remains robust in conditions of high OCR error found in real mass digitized historical corpora that disrupt previous published methods, while also capturing paraphrase and indirect influence. I evaluate this method on a large corpus of 250,000 nonfiction texts from the 19th century, and find that discovered influence is in line with history of science literature. By expanding the scope of our search for influence and the origins of ideas beyond traditional structured corpora and canonical works and figures, we can get a more nuanced perspective on influence and idea dissemination that can encompass epistemically marginalized groups.</abstract>
<identifier type="citekey">li-2024-tracing</identifier>
<location>
<url>https://aclanthology.org/2024.nlp4dh-1.2</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>9</start>
<end>16</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Tracing the Genealogies of Ideas with Sentence Embeddings
%A Li, Lucian
%Y Hämäläinen, Mika
%Y Öhman, Emily
%Y Miyagawa, So
%Y Alnajjar, Khalid
%Y Bizzoni, Yuri
%S Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, USA
%F li-2024-tracing
%X Detecting intellectual influence in unstructured text is an important problem for a wide range of fields, including intellectual history, social science, and bibliometrics. A wide range of previous studies in computational social science and digital humanities have attempted to resolve this through a range of dictionary, embedding, and language model based methods. I introduce an approach which leverages a sentence embedding index to efficiently search for similar ideas in a large historical corpus. This method remains robust in conditions of high OCR error found in real mass digitized historical corpora that disrupt previous published methods, while also capturing paraphrase and indirect influence. I evaluate this method on a large corpus of 250,000 nonfiction texts from the 19th century, and find that discovered influence is in line with history of science literature. By expanding the scope of our search for influence and the origins of ideas beyond traditional structured corpora and canonical works and figures, we can get a more nuanced perspective on influence and idea dissemination that can encompass epistemically marginalized groups.
%U https://aclanthology.org/2024.nlp4dh-1.2
%P 9-16
Markdown (Informal)
[Tracing the Genealogies of Ideas with Sentence Embeddings](https://aclanthology.org/2024.nlp4dh-1.2) (Li, NLP4DH 2024)
ACL