@inproceedings{nedey-etal-2026-ocwikidialects,
title = "{O}c{W}iki{D}ialects: A {W}ikipedia Dataset With Rich Metadata for {O}ccitan Dialect Identification",
author = "N{\'e}dey, Oriane and
Bawden, Rachel and
Cl{\'e}rice, Thibault and
Sagot, Beno{\^i}t",
booktitle = "Proceedings of the 13th Workshop on {NLP} for Similar Languages, Varieties and Dialects",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.vardial-1.4/",
pages = "45--57",
abstract = "Occitan is a Romance language spoken mostly in the South of France and characterised by rich dialectal variation, which can pose problems for certain NLP tools. This shortfall is largely attributable to the scarcity of dialect-annotated corpora, in a context where linguistic classification within the Occitan dialect continuum is still debated and major nomenclatures, such as ISO 639, fail to provide granular codes for varieties below the generic ``Occitan'' label. In this paper, we introduce OcWikiDialects, a new dataset comprising articles from the Occitan Wikipedia. The corpus features rich metadata, including dialect labels, and is segmented at both paragraph and sentence levels. Combined with previously released datasets, we explore approaches for Occitan dialect identification by training three types of model on up to 8 labels: linear SVM classifiers based on word and character n-grams, FastText classifiers based on pretrained vectors, and BERT-based neural classifiers adapted through fine-tuning. Evaluations across in- and out-of-domain test sets demonstrate the substantial impact of our new dataset for the task. However, a peak macro-averaged F1 score of 58.15 underscores persistent challenges for underrepresented Occitan varieties, supported by our per-dialect analysis. Code, dataset and models are available: https://github.com/DEFI-COLaF/OcWikiDialects."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nedey-etal-2026-ocwikidialects">
<titleInfo>
<title>OcWikiDialects: A Wikipedia Dataset With Rich Metadata for Occitan Dialect Identification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oriane</namePart>
<namePart type="family">Nédey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rachel</namePart>
<namePart type="family">Bawden</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thibault</namePart>
<namePart type="family">Clérice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benoît</namePart>
<namePart type="family">Sagot</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 13th Workshop on NLP for Similar Languages, Varieties and Dialects</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Occitan is a Romance language spoken mostly in the South of France and characterised by rich dialectal variation, which can pose problems for certain NLP tools. This shortfall is largely attributable to the scarcity of dialect-annotated corpora, in a context where linguistic classification within the Occitan dialect continuum is still debated and major nomenclatures, such as ISO 639, fail to provide granular codes for varieties below the generic “Occitan” label. In this paper, we introduce OcWikiDialects, a new dataset comprising articles from the Occitan Wikipedia. The corpus features rich metadata, including dialect labels, and is segmented at both paragraph and sentence levels. Combined with previously released datasets, we explore approaches for Occitan dialect identification by training three types of model on up to 8 labels: linear SVM classifiers based on word and character n-grams, FastText classifiers based on pretrained vectors, and BERT-based neural classifiers adapted through fine-tuning. Evaluations across in- and out-of-domain test sets demonstrate the substantial impact of our new dataset for the task. However, a peak macro-averaged F1 score of 58.15 underscores persistent challenges for underrepresented Occitan varieties, supported by our per-dialect analysis. Code, dataset and models are available: https://github.com/DEFI-COLaF/OcWikiDialects.</abstract>
<identifier type="citekey">nedey-etal-2026-ocwikidialects</identifier>
<location>
<url>https://aclanthology.org/2026.vardial-1.4/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>45</start>
<end>57</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OcWikiDialects: A Wikipedia Dataset With Rich Metadata for Occitan Dialect Identification
%A Nédey, Oriane
%A Bawden, Rachel
%A Clérice, Thibault
%A Sagot, Benoît
%S Proceedings of the 13th Workshop on NLP for Similar Languages, Varieties and Dialects
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F nedey-etal-2026-ocwikidialects
%X Occitan is a Romance language spoken mostly in the South of France and characterised by rich dialectal variation, which can pose problems for certain NLP tools. This shortfall is largely attributable to the scarcity of dialect-annotated corpora, in a context where linguistic classification within the Occitan dialect continuum is still debated and major nomenclatures, such as ISO 639, fail to provide granular codes for varieties below the generic “Occitan” label. In this paper, we introduce OcWikiDialects, a new dataset comprising articles from the Occitan Wikipedia. The corpus features rich metadata, including dialect labels, and is segmented at both paragraph and sentence levels. Combined with previously released datasets, we explore approaches for Occitan dialect identification by training three types of model on up to 8 labels: linear SVM classifiers based on word and character n-grams, FastText classifiers based on pretrained vectors, and BERT-based neural classifiers adapted through fine-tuning. Evaluations across in- and out-of-domain test sets demonstrate the substantial impact of our new dataset for the task. However, a peak macro-averaged F1 score of 58.15 underscores persistent challenges for underrepresented Occitan varieties, supported by our per-dialect analysis. Code, dataset and models are available: https://github.com/DEFI-COLaF/OcWikiDialects.
%U https://aclanthology.org/2026.vardial-1.4/
%P 45-57
Markdown (Informal)
[OcWikiDialects: A Wikipedia Dataset With Rich Metadata for Occitan Dialect Identification](https://aclanthology.org/2026.vardial-1.4/) (Nédey et al., VarDial 2026)
ACL