@inproceedings{galletti-etal-2025-keywords,
title = "Are Your Keywords Like My Queries? A Corpus-Wide Evaluation of Keyword Extractors with Real Searches",
author = "Galletti, Martina and
Prevedello, Giulio and
Brugnoli, Emanuele and
Lo Sardo, Donald Ruggiero and
Gravino, Pietro",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.133/",
pages = "1943--1951",
abstract = "Keyword Extraction (KE) is essential in Natural Language Processing (NLP) for identifying key terms that represent the main themes of a text, and it is vital for applications such as information retrieval, text summarisation, and document classification. Despite the development of various KE methods {---} including statistical approaches and advanced deep learning models {---} evaluating their effectiveness remains challenging. Current evaluation metrics focus on keyword quality, balance, and overlap with annotations from authors and professional indexers, but neglect real-world information retrieval needs. This paper introduces a novel evaluation method designed to overcome this limitation by using real query data from Google Trends and can be used with both supervised and unsupervised KE approaches. We applied this method to three popular KE approaches (YAKE, RAKE and KeyBERT) and found that KeyBERT was the most effective in capturing users' top queries, with RAKE also showing surprisingly good performance. The code is open-access and publicly available."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="galletti-etal-2025-keywords">
<titleInfo>
<title>Are Your Keywords Like My Queries? A Corpus-Wide Evaluation of Keyword Extractors with Real Searches</title>
</titleInfo>
<name type="personal">
<namePart type="given">Martina</namePart>
<namePart type="family">Galletti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Giulio</namePart>
<namePart type="family">Prevedello</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emanuele</namePart>
<namePart type="family">Brugnoli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Donald</namePart>
<namePart type="given">Ruggiero</namePart>
<namePart type="family">Lo Sardo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pietro</namePart>
<namePart type="family">Gravino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Keyword Extraction (KE) is essential in Natural Language Processing (NLP) for identifying key terms that represent the main themes of a text, and it is vital for applications such as information retrieval, text summarisation, and document classification. Despite the development of various KE methods — including statistical approaches and advanced deep learning models — evaluating their effectiveness remains challenging. Current evaluation metrics focus on keyword quality, balance, and overlap with annotations from authors and professional indexers, but neglect real-world information retrieval needs. This paper introduces a novel evaluation method designed to overcome this limitation by using real query data from Google Trends and can be used with both supervised and unsupervised KE approaches. We applied this method to three popular KE approaches (YAKE, RAKE and KeyBERT) and found that KeyBERT was the most effective in capturing users’ top queries, with RAKE also showing surprisingly good performance. The code is open-access and publicly available.</abstract>
<identifier type="citekey">galletti-etal-2025-keywords</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.133/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>1943</start>
<end>1951</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Are Your Keywords Like My Queries? A Corpus-Wide Evaluation of Keyword Extractors with Real Searches
%A Galletti, Martina
%A Prevedello, Giulio
%A Brugnoli, Emanuele
%A Lo Sardo, Donald Ruggiero
%A Gravino, Pietro
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F galletti-etal-2025-keywords
%X Keyword Extraction (KE) is essential in Natural Language Processing (NLP) for identifying key terms that represent the main themes of a text, and it is vital for applications such as information retrieval, text summarisation, and document classification. Despite the development of various KE methods — including statistical approaches and advanced deep learning models — evaluating their effectiveness remains challenging. Current evaluation metrics focus on keyword quality, balance, and overlap with annotations from authors and professional indexers, but neglect real-world information retrieval needs. This paper introduces a novel evaluation method designed to overcome this limitation by using real query data from Google Trends and can be used with both supervised and unsupervised KE approaches. We applied this method to three popular KE approaches (YAKE, RAKE and KeyBERT) and found that KeyBERT was the most effective in capturing users’ top queries, with RAKE also showing surprisingly good performance. The code is open-access and publicly available.
%U https://aclanthology.org/2025.coling-main.133/
%P 1943-1951
Markdown (Informal)
[Are Your Keywords Like My Queries? A Corpus-Wide Evaluation of Keyword Extractors with Real Searches](https://aclanthology.org/2025.coling-main.133/) (Galletti et al., COLING 2025)
ACL