@inproceedings{bengoetxea-etal-2025-hitz,
title = "{H}i{TZ} at {V}ar{D}ial 2025 {N}or{SID}: Overcoming Data Scarcity with Language Transfer and Automatic Data Annotation",
author = "Bengoetxea, Jaione and
Zubillaga, Mikel and
Azurmendi, Ekhi and
Heredia, Maite and
Etxaniz, Julen and
Ferro, Markel and
Barnes, Jeremy",
editor = "Scherrer, Yves and
Jauhiainen, Tommi and
Ljube{\v{s}}i{\'c}, Nikola and
Nakov, Preslav and
Tiedemann, Jorg and
Zampieri, Marcos",
booktitle = "Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.vardial-1.16/",
pages = "209--219",
abstract = "In this paper we present our submission for the NorSID Shared Task as part of the 2025 VarDial Workshop, consisting of three tasks: Intent Detection, Slot Filling and Dialect Identification, evaluated using data in different dialects of the Norwegian language. For Intent Detection and Slot Filling, we have fine-tuned a multitask model in a cross-lingual setting, to leverage the xSID dataset available in 17 languages. In the case of Dialect Identification, our final submission consists of a model fine-tuned on the provided development set, which has obtained the highest scores within our experiments. Our final results on the test set show that our models do not drop in performance compared to the development set, likely due to the domain-specificity of the dataset and the similar distribution of both subsets. Finally, we also report an in-depth analysis of the provided datasets and their artifacts, as well as other sets of experiments that have been carried out but did not yield the best results. Additionally, we present an analysis on the reasons why some methods have been more successful than others; mainly the impact of the combination of languages and domain-specificity of the training data on the results."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bengoetxea-etal-2025-hitz">
<titleInfo>
<title>HiTZ at VarDial 2025 NorSID: Overcoming Data Scarcity with Language Transfer and Automatic Data Annotation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jaione</namePart>
<namePart type="family">Bengoetxea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mikel</namePart>
<namePart type="family">Zubillaga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekhi</namePart>
<namePart type="family">Azurmendi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Heredia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julen</namePart>
<namePart type="family">Etxaniz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Markel</namePart>
<namePart type="family">Ferro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Barnes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yves</namePart>
<namePart type="family">Scherrer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tommi</namePart>
<namePart type="family">Jauhiainen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikola</namePart>
<namePart type="family">Ljubešić</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jorg</namePart>
<namePart type="family">Tiedemann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marcos</namePart>
<namePart type="family">Zampieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we present our submission for the NorSID Shared Task as part of the 2025 VarDial Workshop, consisting of three tasks: Intent Detection, Slot Filling and Dialect Identification, evaluated using data in different dialects of the Norwegian language. For Intent Detection and Slot Filling, we have fine-tuned a multitask model in a cross-lingual setting, to leverage the xSID dataset available in 17 languages. In the case of Dialect Identification, our final submission consists of a model fine-tuned on the provided development set, which has obtained the highest scores within our experiments. Our final results on the test set show that our models do not drop in performance compared to the development set, likely due to the domain-specificity of the dataset and the similar distribution of both subsets. Finally, we also report an in-depth analysis of the provided datasets and their artifacts, as well as other sets of experiments that have been carried out but did not yield the best results. Additionally, we present an analysis on the reasons why some methods have been more successful than others; mainly the impact of the combination of languages and domain-specificity of the training data on the results.</abstract>
<identifier type="citekey">bengoetxea-etal-2025-hitz</identifier>
<location>
<url>https://aclanthology.org/2025.vardial-1.16/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>209</start>
<end>219</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T HiTZ at VarDial 2025 NorSID: Overcoming Data Scarcity with Language Transfer and Automatic Data Annotation
%A Bengoetxea, Jaione
%A Zubillaga, Mikel
%A Azurmendi, Ekhi
%A Heredia, Maite
%A Etxaniz, Julen
%A Ferro, Markel
%A Barnes, Jeremy
%Y Scherrer, Yves
%Y Jauhiainen, Tommi
%Y Ljubešić, Nikola
%Y Nakov, Preslav
%Y Tiedemann, Jorg
%Y Zampieri, Marcos
%S Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F bengoetxea-etal-2025-hitz
%X In this paper we present our submission for the NorSID Shared Task as part of the 2025 VarDial Workshop, consisting of three tasks: Intent Detection, Slot Filling and Dialect Identification, evaluated using data in different dialects of the Norwegian language. For Intent Detection and Slot Filling, we have fine-tuned a multitask model in a cross-lingual setting, to leverage the xSID dataset available in 17 languages. In the case of Dialect Identification, our final submission consists of a model fine-tuned on the provided development set, which has obtained the highest scores within our experiments. Our final results on the test set show that our models do not drop in performance compared to the development set, likely due to the domain-specificity of the dataset and the similar distribution of both subsets. Finally, we also report an in-depth analysis of the provided datasets and their artifacts, as well as other sets of experiments that have been carried out but did not yield the best results. Additionally, we present an analysis on the reasons why some methods have been more successful than others; mainly the impact of the combination of languages and domain-specificity of the training data on the results.
%U https://aclanthology.org/2025.vardial-1.16/
%P 209-219
Markdown (Informal)
[HiTZ at VarDial 2025 NorSID: Overcoming Data Scarcity with Language Transfer and Automatic Data Annotation](https://aclanthology.org/2025.vardial-1.16/) (Bengoetxea et al., VarDial 2025)
ACL
- Jaione Bengoetxea, Mikel Zubillaga, Ekhi Azurmendi, Maite Heredia, Julen Etxaniz, Markel Ferro, and Jeremy Barnes. 2025. HiTZ at VarDial 2025 NorSID: Overcoming Data Scarcity with Language Transfer and Automatic Data Annotation. In Proceedings of the 12th Workshop on NLP for Similar Languages, Varieties and Dialects, pages 209–219, Abu Dhabi, UAE. Association for Computational Linguistics.