@inproceedings{ma-nyarko-2025-identifying,
title = "Identifying Emerging Concepts in Large Corpora",
author = "Ma, Sibo and
Nyarko, Julian",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.344/",
doi = "10.18653/v1/2025.naacl-long.344",
pages = "6760--6778",
ISBN = "979-8-89176-189-6",
abstract = "We introduce a new method to identify emerging concepts in large text corpora. By analyzing changes in the heatmaps of the underlying embedding space, we are able to detect these concepts with high accuracy shortly after they originate, in turn outperforming common alternatives. We further demonstrate the utility of our approach by analyzing speeches in the U.S. Senate from 1941 to 2015. Our results suggest that the minority party is more active in introducing new concepts into the Senate discourse. We also identify specific concepts that closely correlate with the Senators' racial, ethnic, and gender identities. An implementation of our method is publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ma-nyarko-2025-identifying">
<titleInfo>
<title>Identifying Emerging Concepts in Large Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sibo</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julian</namePart>
<namePart type="family">Nyarko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>We introduce a new method to identify emerging concepts in large text corpora. By analyzing changes in the heatmaps of the underlying embedding space, we are able to detect these concepts with high accuracy shortly after they originate, in turn outperforming common alternatives. We further demonstrate the utility of our approach by analyzing speeches in the U.S. Senate from 1941 to 2015. Our results suggest that the minority party is more active in introducing new concepts into the Senate discourse. We also identify specific concepts that closely correlate with the Senators’ racial, ethnic, and gender identities. An implementation of our method is publicly available.</abstract>
<identifier type="citekey">ma-nyarko-2025-identifying</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.344</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.344/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>6760</start>
<end>6778</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Identifying Emerging Concepts in Large Corpora
%A Ma, Sibo
%A Nyarko, Julian
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F ma-nyarko-2025-identifying
%X We introduce a new method to identify emerging concepts in large text corpora. By analyzing changes in the heatmaps of the underlying embedding space, we are able to detect these concepts with high accuracy shortly after they originate, in turn outperforming common alternatives. We further demonstrate the utility of our approach by analyzing speeches in the U.S. Senate from 1941 to 2015. Our results suggest that the minority party is more active in introducing new concepts into the Senate discourse. We also identify specific concepts that closely correlate with the Senators’ racial, ethnic, and gender identities. An implementation of our method is publicly available.
%R 10.18653/v1/2025.naacl-long.344
%U https://aclanthology.org/2025.naacl-long.344/
%U https://doi.org/10.18653/v1/2025.naacl-long.344
%P 6760-6778
Markdown (Informal)
[Identifying Emerging Concepts in Large Corpora](https://aclanthology.org/2025.naacl-long.344/) (Ma & Nyarko, NAACL 2025)
ACL
- Sibo Ma and Julian Nyarko. 2025. Identifying Emerging Concepts in Large Corpora. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 6760–6778, Albuquerque, New Mexico. Association for Computational Linguistics.