@inproceedings{blstak-etal-2025-dictionary,
title = "When the Dictionary Strikes Back: A Case Study on {S}lovak Migration Location Term Extraction and {NER} via Rule-Based vs. {LLM} Methods",
author = "Bl{\v{s}}t{\'a}k, Miroslav and
Kop{\v{c}}an, Jaroslav and
Suppa, Marek and
Havran, Samuel and
Findor, Andrej and
Takac, Martin and
Simko, Marian",
editor = "Piskorski, Jakub and
P{\v{r}}ib{\'a}{\v{n}}, Pavel and
Nakov, Preslav and
Yangarber, Roman and
Marcinczuk, Michal",
booktitle = "Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bsnlp-1.11/",
doi = "10.18653/v1/2025.bsnlp-1.11",
pages = "91--100",
ISBN = "978-1-959429-57-9",
abstract = "This study explores the task of automatically extracting migration-related locations (source and destination) from media articles, focusing on the challenges posed by Slovak, a low-resource and morphologically complex language. We present the first comparative analysis of rule-based dictionary approaches (NLP4SK) versus Large Language Models (LLMs, e.g. SlovakBERT, GPT-4o) for both geographical relevance classification (Slovakia-focused migration) and specific source/target location extraction. To facilitate this research and future work, we introduce the first manually annotated Slovak dataset tailored for migration-focused locality detection. Our results show that while a fine-tuned SlovakBERT model achieves high accuracy for classification, specialized rule-based methods still have the potential to outperform LLMs for specific extraction tasks, though improved LLM performance with few-shot examples suggests future competitiveness as research in this area continues to evolve."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="blstak-etal-2025-dictionary">
<titleInfo>
<title>When the Dictionary Strikes Back: A Case Study on Slovak Migration Location Term Extraction and NER via Rule-Based vs. LLM Methods</title>
</titleInfo>
<name type="personal">
<namePart type="given">Miroslav</namePart>
<namePart type="family">Blšták</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jaroslav</namePart>
<namePart type="family">Kopčan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marek</namePart>
<namePart type="family">Suppa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Samuel</namePart>
<namePart type="family">Havran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrej</namePart>
<namePart type="family">Findor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="family">Takac</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marian</namePart>
<namePart type="family">Simko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jakub</namePart>
<namePart type="family">Piskorski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pavel</namePart>
<namePart type="family">Přibáň</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Yangarber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Marcinczuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-1-959429-57-9</identifier>
</relatedItem>
<abstract>This study explores the task of automatically extracting migration-related locations (source and destination) from media articles, focusing on the challenges posed by Slovak, a low-resource and morphologically complex language. We present the first comparative analysis of rule-based dictionary approaches (NLP4SK) versus Large Language Models (LLMs, e.g. SlovakBERT, GPT-4o) for both geographical relevance classification (Slovakia-focused migration) and specific source/target location extraction. To facilitate this research and future work, we introduce the first manually annotated Slovak dataset tailored for migration-focused locality detection. Our results show that while a fine-tuned SlovakBERT model achieves high accuracy for classification, specialized rule-based methods still have the potential to outperform LLMs for specific extraction tasks, though improved LLM performance with few-shot examples suggests future competitiveness as research in this area continues to evolve.</abstract>
<identifier type="citekey">blstak-etal-2025-dictionary</identifier>
<identifier type="doi">10.18653/v1/2025.bsnlp-1.11</identifier>
<location>
<url>https://aclanthology.org/2025.bsnlp-1.11/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>91</start>
<end>100</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T When the Dictionary Strikes Back: A Case Study on Slovak Migration Location Term Extraction and NER via Rule-Based vs. LLM Methods
%A Blšták, Miroslav
%A Kopčan, Jaroslav
%A Suppa, Marek
%A Havran, Samuel
%A Findor, Andrej
%A Takac, Martin
%A Simko, Marian
%Y Piskorski, Jakub
%Y Přibáň, Pavel
%Y Nakov, Preslav
%Y Yangarber, Roman
%Y Marcinczuk, Michal
%S Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 978-1-959429-57-9
%F blstak-etal-2025-dictionary
%X This study explores the task of automatically extracting migration-related locations (source and destination) from media articles, focusing on the challenges posed by Slovak, a low-resource and morphologically complex language. We present the first comparative analysis of rule-based dictionary approaches (NLP4SK) versus Large Language Models (LLMs, e.g. SlovakBERT, GPT-4o) for both geographical relevance classification (Slovakia-focused migration) and specific source/target location extraction. To facilitate this research and future work, we introduce the first manually annotated Slovak dataset tailored for migration-focused locality detection. Our results show that while a fine-tuned SlovakBERT model achieves high accuracy for classification, specialized rule-based methods still have the potential to outperform LLMs for specific extraction tasks, though improved LLM performance with few-shot examples suggests future competitiveness as research in this area continues to evolve.
%R 10.18653/v1/2025.bsnlp-1.11
%U https://aclanthology.org/2025.bsnlp-1.11/
%U https://doi.org/10.18653/v1/2025.bsnlp-1.11
%P 91-100
Markdown (Informal)
[When the Dictionary Strikes Back: A Case Study on Slovak Migration Location Term Extraction and NER via Rule-Based vs. LLM Methods](https://aclanthology.org/2025.bsnlp-1.11/) (Blšták et al., BSNLP 2025)
ACL