@inproceedings{saleva-palen-michel-2024-brandeis,
title = "{B}randeis at {V}ar{D}ial 2024 {DSL}-{ML} Shared Task: Multilingual Models, Simple Baselines and Data Augmentation",
author = {S{\"a}lev{\"a}, Jonne and
Palen-Michel, Chester},
editor = {Scherrer, Yves and
Jauhiainen, Tommi and
Ljube{\v{s}}i{\'c}, Nikola and
Zampieri, Marcos and
Nakov, Preslav and
Tiedemann, J{\"o}rg},
booktitle = "Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.vardial-1.22",
doi = "10.18653/v1/2024.vardial-1.22",
pages = "241--251",
abstract = "This paper describes the Brandeis University submission to VarDial 2024 DSL-ML Shared Task on multilabel classification for discriminating between similar languages. Our submission consists of three entries per language to the closed track, where no additional data was permitted. Our approach involves a set of simple non-neural baselines using logistic regression, random forests and support vector machines. We follow this by experimenting with finetuning multilingual BERT, either on a single language or all the languages concatenated together.In addition to benchmarking the model architectures against one another on the development set, we perform extensive hyperparameter tuning, which is afforded by the small size of the training data.Our experiments on the development set suggest that finetuned mBERT systems significantly benefit most languages compared to the baseline.However, on the test set, our results indicate that simple models based on scikit-learn can perform surprisingly well and even outperform pretrained language models, as we see with BCMS.Our submissions achieve the best performance on all languages as reported by the organizers. Except for Spanish and French, our non-neural baseline also ranks in the top 3 for all other languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="saleva-palen-michel-2024-brandeis">
<titleInfo>
<title>Brandeis at VarDial 2024 DSL-ML Shared Task: Multilingual Models, Simple Baselines and Data Augmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jonne</namePart>
<namePart type="family">Sälevä</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chester</namePart>
<namePart type="family">Palen-Michel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Scherrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Jauhiainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jörg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes the Brandeis University submission to VarDial 2024 DSL-ML Shared Task on multilabel classification for discriminating between similar languages. Our submission consists of three entries per language to the closed track, where no additional data was permitted. Our approach involves a set of simple non-neural baselines using logistic regression, random forests and support vector machines. We follow this by experimenting with finetuning multilingual BERT, either on a single language or all the languages concatenated together. In addition to benchmarking the model architectures against one another on the development set, we perform extensive hyperparameter tuning, which is afforded by the small size of the training data. Our experiments on the development set suggest that finetuned mBERT systems significantly benefit most languages compared to the baseline. However, on the test set, our results indicate that simple models based on scikit-learn can perform surprisingly well and even outperform pretrained language models, as we see with BCMS. Our submissions achieve the best performance on all languages as reported by the organizers. Except for Spanish and French, our non-neural baseline also ranks in the top 3 for all other languages.</abstract>
<identifier type="citekey">saleva-palen-michel-2024-brandeis</identifier>
<identifier type="doi">10.18653/v1/2024.vardial-1.22</identifier>
<location>
<url>https://aclanthology.org/2024.vardial-1.22</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>241</start>
<end>251</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Brandeis at VarDial 2024 DSL-ML Shared Task: Multilingual Models, Simple Baselines and Data Augmentation
%A Sälevä, Jonne
%A Palen-Michel, Chester
%Y Scherrer, Yves
%Y Jauhiainen, Tommi
%Y Ljubešić, Nikola
%Y Zampieri, Marcos
%Y Nakov, Preslav
%Y Tiedemann, Jörg
%S Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F saleva-palen-michel-2024-brandeis
%X This paper describes the Brandeis University submission to VarDial 2024 DSL-ML Shared Task on multilabel classification for discriminating between similar languages. Our submission consists of three entries per language to the closed track, where no additional data was permitted. Our approach involves a set of simple non-neural baselines using logistic regression, random forests and support vector machines. We follow this by experimenting with finetuning multilingual BERT, either on a single language or all the languages concatenated together. In addition to benchmarking the model architectures against one another on the development set, we perform extensive hyperparameter tuning, which is afforded by the small size of the training data. Our experiments on the development set suggest that finetuned mBERT systems significantly benefit most languages compared to the baseline. However, on the test set, our results indicate that simple models based on scikit-learn can perform surprisingly well and even outperform pretrained language models, as we see with BCMS. Our submissions achieve the best performance on all languages as reported by the organizers. Except for Spanish and French, our non-neural baseline also ranks in the top 3 for all other languages.
%R 10.18653/v1/2024.vardial-1.22
%U https://aclanthology.org/2024.vardial-1.22
%U https://doi.org/10.18653/v1/2024.vardial-1.22
%P 241-251
Markdown (Informal)
[Brandeis at VarDial 2024 DSL-ML Shared Task: Multilingual Models, Simple Baselines and Data Augmentation](https://aclanthology.org/2024.vardial-1.22) (Sälevä & Palen-Michel, VarDial-WS 2024)
ACL
Jonne Sälevä and Chester Palen-Michel. 2024. Brandeis at VarDial 2024 DSL-ML Shared Task: Multilingual Models, Simple Baselines and Data Augmentation. In Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024), pages 241–251, Mexico City, Mexico. Association for Computational Linguistics.