@inproceedings{mansour-etal-2025-well,
title = "How Well Do Large Language Models Extract Keywords? A Systematic Evaluation on Scientific Corpora",
author = "Mansour, Nacef Ben and
Rahimi, Hamed and
Alrahabi, Motasem",
editor = "Jansen, Peter and
Dalvi Mishra, Bhavana and
Trivedi, Harsh and
Prasad Majumder, Bodhisattwa and
Hope, Tom and
Khot, Tushar and
Downey, Doug and
Horvitz, Eric",
booktitle = "Proceedings of the 1st Workshop on AI and Scientific Discovery: Directions and Opportunities",
month = may,
year = "2025",
address = "Albuquerque, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.aisd-main.2/",
doi = "10.18653/v1/2025.aisd-main.2",
pages = "13--21",
isbn = "979-8-89176-224-4",
abstract = "Automatic keyword extraction from scientific articles is pivotal for organizing scholarly archives, powering semantic search engines, and mapping interdisciplinary research trends. However, existing methods{---}including statistical and graph-based approaches{---}struggle to handle domain-specific challenges such as technical terminology, cross-disciplinary ambiguity, and dynamic scientific jargon. This paper presents an empirical comparison of traditional keyword extraction methods (e.g. TextRank and YAKE) with approaches based on Large Language Model. We introduce a novel evaluation framework that combines fuzzy semantic matching based on Levenshtein Distance with exact-match metrics (F1, precision, recall) to address inconsistencies in keyword normalization across scientific corpora. Through an extensive ablation study across nine different LLMs, we analyze their performance and associated costs. Our findings reveal that LLM-based methods consistently achieve superior precision and relevance compared to traditional approaches. This performance advantage suggests significant potential for improving scientific search systems and information retrieval in academic contexts."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mansour-etal-2025-well">
<titleInfo>
<title>How Well Do Large Language Models Extract Keywords? A Systematic Evaluation on Scientific Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nacef</namePart>
<namePart type="given">Ben</namePart>
<namePart type="family">Mansour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamed</namePart>
<namePart type="family">Rahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Motasem</namePart>
<namePart type="family">Alrahabi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on AI and Scientific Discovery: Directions and Opportunities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Jansen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bhavana</namePart>
<namePart type="family">Dalvi Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Harsh</namePart>
<namePart type="family">Trivedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bodhisattwa</namePart>
<namePart type="family">Prasad Majumder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Hope</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tushar</namePart>
<namePart type="family">Khot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Doug</namePart>
<namePart type="family">Downey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Horvitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-224-4</identifier>
</relatedItem>
<abstract>Automatic keyword extraction from scientific articles is pivotal for organizing scholarly archives, powering semantic search engines, and mapping interdisciplinary research trends. However, existing methods—including statistical and graph-based approaches—struggle to handle domain-specific challenges such as technical terminology, cross-disciplinary ambiguity, and dynamic scientific jargon. This paper presents an empirical comparison of traditional keyword extraction methods (e.g. TextRank and YAKE) with approaches based on Large Language Model. We introduce a novel evaluation framework that combines fuzzy semantic matching based on Levenshtein Distance with exact-match metrics (F1, precision, recall) to address inconsistencies in keyword normalization across scientific corpora. Through an extensive ablation study across nine different LLMs, we analyze their performance and associated costs. Our findings reveal that LLM-based methods consistently achieve superior precision and relevance compared to traditional approaches. This performance advantage suggests significant potential for improving scientific search systems and information retrieval in academic contexts.</abstract>
<identifier type="citekey">mansour-etal-2025-well</identifier>
<identifier type="doi">10.18653/v1/2025.aisd-main.2</identifier>
<location>
<url>https://aclanthology.org/2025.aisd-main.2/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>13</start>
<end>21</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How Well Do Large Language Models Extract Keywords? A Systematic Evaluation on Scientific Corpora
%A Mansour, Nacef Ben
%A Rahimi, Hamed
%A Alrahabi, Motasem
%Y Jansen, Peter
%Y Dalvi Mishra, Bhavana
%Y Trivedi, Harsh
%Y Prasad Majumder, Bodhisattwa
%Y Hope, Tom
%Y Khot, Tushar
%Y Downey, Doug
%Y Horvitz, Eric
%S Proceedings of the 1st Workshop on AI and Scientific Discovery: Directions and Opportunities
%D 2025
%8 May
%I Association for Computational Linguistics
%C Albuquerque, New Mexico, USA
%@ 979-8-89176-224-4
%F mansour-etal-2025-well
%X Automatic keyword extraction from scientific articles is pivotal for organizing scholarly archives, powering semantic search engines, and mapping interdisciplinary research trends. However, existing methods—including statistical and graph-based approaches—struggle to handle domain-specific challenges such as technical terminology, cross-disciplinary ambiguity, and dynamic scientific jargon. This paper presents an empirical comparison of traditional keyword extraction methods (e.g. TextRank and YAKE) with approaches based on Large Language Model. We introduce a novel evaluation framework that combines fuzzy semantic matching based on Levenshtein Distance with exact-match metrics (F1, precision, recall) to address inconsistencies in keyword normalization across scientific corpora. Through an extensive ablation study across nine different LLMs, we analyze their performance and associated costs. Our findings reveal that LLM-based methods consistently achieve superior precision and relevance compared to traditional approaches. This performance advantage suggests significant potential for improving scientific search systems and information retrieval in academic contexts.
%R 10.18653/v1/2025.aisd-main.2
%U https://aclanthology.org/2025.aisd-main.2/
%U https://doi.org/10.18653/v1/2025.aisd-main.2
%P 13-21
Markdown (Informal)
[How Well Do Large Language Models Extract Keywords? A Systematic Evaluation on Scientific Corpora](https://aclanthology.org/2025.aisd-main.2/) (Mansour et al., AISD 2025)
ACL