@inproceedings{hopton-aepli-2024-modeling,
title = "Modeling Orthographic Variation in {O}ccitan{'}s Dialects",
author = {Hopton, Zachary and
Aepli, No{\"e}mi},
editor = {Scherrer, Yves and
Jauhiainen, Tommi and
Ljube{\v{s}}i{\'c}, Nikola and
Zampieri, Marcos and
Nakov, Preslav and
Tiedemann, J{\"o}rg},
booktitle = "Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.vardial-1.6",
doi = "10.18653/v1/2024.vardial-1.6",
pages = "78--88",
abstract = "Effectively normalizing spellings in textual data poses a considerable challenge, especially for low-resource languages lacking standardized writing systems. In this study, we fine-tuned a multilingual model with data from several Occitan dialects and conducted a series of experiments to assess the model{'}s representations of these dialects. For evaluation purposes, we compiled a parallel lexicon encompassing four Occitan dialects.Intrinsic evaluations of the model{'}s embeddings revealed that surface similarity between the dialects strengthened representations. When the model was further fine-tuned for part-of-speech tagging, its performance was robust to dialectical variation, even when trained solely on part-of-speech data from a single dialect. Our findings suggest that large multilingual models minimize the need for spelling normalization during pre-processing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hopton-aepli-2024-modeling">
<titleInfo>
<title>Modeling Orthographic Variation in Occitan’s Dialects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zachary</namePart>
<namePart type="family">Hopton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noëmi</namePart>
<namePart type="family">Aepli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Scherrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Jauhiainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Effectively normalizing spellings in textual data poses a considerable challenge, especially for low-resource languages lacking standardized writing systems. In this study, we fine-tuned a multilingual model with data from several Occitan dialects and conducted a series of experiments to assess the model’s representations of these dialects. For evaluation purposes, we compiled a parallel lexicon encompassing four Occitan dialects.Intrinsic evaluations of the model’s embeddings revealed that surface similarity between the dialects strengthened representations. When the model was further fine-tuned for part-of-speech tagging, its performance was robust to dialectical variation, even when trained solely on part-of-speech data from a single dialect. Our findings suggest that large multilingual models minimize the need for spelling normalization during pre-processing.</abstract>
<identifier type="citekey">hopton-aepli-2024-modeling</identifier>
<identifier type="doi">10.18653/v1/2024.vardial-1.6</identifier>
<location>
<url>https://aclanthology.org/2024.vardial-1.6</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>78</start>
<end>88</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Modeling Orthographic Variation in Occitan’s Dialects
%A Hopton, Zachary
%A Aepli, Noëmi
%Y Scherrer, Yves
%Y Jauhiainen, Tommi
%Y Ljubešić, Nikola
%Y Zampieri, Marcos
%Y Nakov, Preslav
%Y Tiedemann, Jörg
%S Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F hopton-aepli-2024-modeling
%X Effectively normalizing spellings in textual data poses a considerable challenge, especially for low-resource languages lacking standardized writing systems. In this study, we fine-tuned a multilingual model with data from several Occitan dialects and conducted a series of experiments to assess the model’s representations of these dialects. For evaluation purposes, we compiled a parallel lexicon encompassing four Occitan dialects.Intrinsic evaluations of the model’s embeddings revealed that surface similarity between the dialects strengthened representations. When the model was further fine-tuned for part-of-speech tagging, its performance was robust to dialectical variation, even when trained solely on part-of-speech data from a single dialect. Our findings suggest that large multilingual models minimize the need for spelling normalization during pre-processing.
%R 10.18653/v1/2024.vardial-1.6
%U https://aclanthology.org/2024.vardial-1.6
%U https://doi.org/10.18653/v1/2024.vardial-1.6
%P 78-88
Markdown (Informal)
[Modeling Orthographic Variation in Occitan’s Dialects](https://aclanthology.org/2024.vardial-1.6) (Hopton & Aepli, VarDial-WS 2024)
ACL
- Zachary Hopton and Noëmi Aepli. 2024. Modeling Orthographic Variation in Occitan’s Dialects. In Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024), pages 78–88, Mexico City, Mexico. Association for Computational Linguistics.