% ACL Anthology record: MoRoVoc paper, Findings of EMNLP 2025.
% NOTE(review): the second author was exported with family and given names
% swapped ("Ema-Ioana, B{\u{a}}nescu"); corrected below so that the family
% name is B{\u{a}}nescu -- confirm against the paper's title page.
@inproceedings{avram-etal-2025-morovoc,
  title     = {{MoRoVoc}: A Large Dataset for Geographical Variation Identification of the Spoken {Romanian} Language},
  author    = {Avram, Andrei-Marius and
               B{\u{a}}nescu, Ema-Ioana and
               Robea, Anda-Teodora and
               Cercel, Dumitru-Clementin and
               Cercel, Mihaela-Claudia},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.653/},
  doi       = {10.18653/v1/2025.findings-emnlp.653},
  pages     = {12207--12216},
  isbn      = {979-8-89176-335-7},
  abstract  = {This paper introduces MoRoVoc, the largest dataset for analyzing the regional variation of spoken Romanian. It has more than 93 hours of audio and 88,192 audio samples, balanced between the Romanian language spoken in Romania and the Republic of Moldova. We further propose a multi-target adversarial training framework for speech models that incorporates demographic attributes (i.e., age and gender of the speakers) as adversarial targets, making models discriminative for primary tasks while remaining invariant to secondary attributes. The adversarial coefficients are dynamically adjusted via meta-learning to optimize performance. Our approach yields notable gains: Wav2Vec2-Base achieves 78.21\% accuracy for the variation identification of spoken Romanian using gender as an adversarial target, while Wav2Vec2-Large reaches 93.08\% accuracy for gender classification when employing both dialect and age as adversarial objectives.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!--
    MODS record for the MoRoVoc paper (citekey avram-etal-2025-morovoc).
    The paper itself is described at the top level of <mods>; the containing
    proceedings volume is described in <relatedItem type="host">.
    NOTE(review): the second author's given/family name parts were exported
    swapped; they are corrected below (family = Bănescu, given = Ema-Ioana).
    Confirm against the paper's title page.
  -->
  <mods ID="avram-etal-2025-morovoc">
    <titleInfo>
      <title>MoRoVoc: A Large Dataset for Geographical Variation Identification of the Spoken Romanian Language</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Andrei-Marius</namePart>
      <namePart type="family">Avram</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ema-Ioana</namePart>
      <namePart type="family">Bănescu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anda-Teodora</namePart>
      <namePart type="family">Robea</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dumitru-Clementin</namePart>
      <namePart type="family">Cercel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mihaela-Claudia</namePart>
      <namePart type="family">Cercel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume in which the paper appears. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>
    <abstract>This paper introduces MoRoVoc, the largest dataset for analyzing the regional variation of spoken Romanian. It has more than 93 hours of audio and 88,192 audio samples, balanced between the Romanian language spoken in Romania and the Republic of Moldova. We further propose a multi-target adversarial training framework for speech models that incorporates demographic attributes (i.e., age and gender of the speakers) as adversarial targets, making models discriminative for primary tasks while remaining invariant to secondary attributes. The adversarial coefficients are dynamically adjusted via meta-learning to optimize performance. Our approach yields notable gains: Wav2Vec2-Base achieves 78.21% accuracy for the variation identification of spoken Romanian using gender as an adversarial target, while Wav2Vec2-Large reaches 93.08% accuracy for gender classification when employing both dialect and age as adversarial objectives.</abstract>
    <identifier type="citekey">avram-etal-2025-morovoc</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-emnlp.653</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.653/</url>
    </location>
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>12207</start>
        <end>12216</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T MoRoVoc: A Large Dataset for Geographical Variation Identification of the Spoken Romanian Language
%A Avram, Andrei-Marius
%A Bănescu, Ema-Ioana
%A Robea, Anda-Teodora
%A Cercel, Dumitru-Clementin
%A Cercel, Mihaela-Claudia
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F avram-etal-2025-morovoc
%X This paper introduces MoRoVoc, the largest dataset for analyzing the regional variation of spoken Romanian. It has more than 93 hours of audio and 88,192 audio samples, balanced between the Romanian language spoken in Romania and the Republic of Moldova. We further propose a multi-target adversarial training framework for speech models that incorporates demographic attributes (i.e., age and gender of the speakers) as adversarial targets, making models discriminative for primary tasks while remaining invariant to secondary attributes. The adversarial coefficients are dynamically adjusted via meta-learning to optimize performance. Our approach yields notable gains: Wav2Vec2-Base achieves 78.21% accuracy for the variation identification of spoken Romanian using gender as an adversarial target, while Wav2Vec2-Large reaches 93.08% accuracy for gender classification when employing both dialect and age as adversarial objectives.
%R 10.18653/v1/2025.findings-emnlp.653
%U https://aclanthology.org/2025.findings-emnlp.653/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.653
%P 12207-12216
Markdown (Informal)
[MoRoVoc: A Large Dataset for Geographical Variation Identification of the Spoken Romanian Language](https://aclanthology.org/2025.findings-emnlp.653/) (Avram et al., Findings 2025)
ACL