@inproceedings{codrut-etal-2024-rodia,
title = "{R}o{D}ia: A New Dataset for {R}omanian Dialect Identification from Speech",
author = "Codru{\textcommabelow{t}}, Rotaru and
Ristea, Nicolae and
Ionescu, Radu",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.20",
pages = "279--286",
abstract = "We introduce RoDia, the first dataset for Romanian dialect identification from speech. The RoDia dataset includes a varied compilation of speech samples from five distinct regions of Romania, covering both urban and rural environments, totaling 2 hours of manually annotated speech data. Along with our dataset, we introduce a set of competitive models to be used as baselines for future research. The top scoring model achieves a macro F1 score of 59.83{\%} and a micro F1 score of 62.08{\%}, indicating that the task is challenging. We thus believe that RoDia is a valuable resource that will stimulate research aiming to address the challenges of Romanian dialect identification. We release our dataset at https://github.com/codrut2/RoDia.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="codrut-etal-2024-rodia">
<titleInfo>
<title>RoDia: A New Dataset for Romanian Dialect Identification from Speech</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rotaru</namePart>
<namePart type="family">Codru\textcommabelowt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicolae</namePart>
<namePart type="family">Ristea</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Radu</namePart>
<namePart type="family">Ionescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We introduce RoDia, the first dataset for Romanian dialect identification from speech. The RoDia dataset includes a varied compilation of speech samples from five distinct regions of Romania, covering both urban and rural environments, totaling 2 hours of manually annotated speech data. Along with our dataset, we introduce a set of competitive models to be used as baselines for future research. The top scoring model achieves a macro F1 score of 59.83% and a micro F1 score of 62.08%, indicating that the task is challenging. We thus believe that RoDia is a valuable resource that will stimulate research aiming to address the challenges of Romanian dialect identification. We release our dataset at https://github.com/codrut2/RoDia.</abstract>
<identifier type="citekey">codrut-etal-2024-rodia</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.20</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>279</start>
<end>286</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T RoDia: A New Dataset for Romanian Dialect Identification from Speech
%A Codru\textcommabelowt, Rotaru
%A Ristea, Nicolae
%A Ionescu, Radu
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F codrut-etal-2024-rodia
%X We introduce RoDia, the first dataset for Romanian dialect identification from speech. The RoDia dataset includes a varied compilation of speech samples from five distinct regions of Romania, covering both urban and rural environments, totaling 2 hours of manually annotated speech data. Along with our dataset, we introduce a set of competitive models to be used as baselines for future research. The top scoring model achieves a macro F1 score of 59.83% and a micro F1 score of 62.08%, indicating that the task is challenging. We thus believe that RoDia is a valuable resource that will stimulate research aiming to address the challenges of Romanian dialect identification. We release our dataset at https://github.com/codrut2/RoDia.
%U https://aclanthology.org/2024.findings-naacl.20
%P 279-286
Markdown (Informal)
[RoDia: A New Dataset for Romanian Dialect Identification from Speech](https://aclanthology.org/2024.findings-naacl.20) (Codruț et al., Findings 2024)
ACL