@inproceedings{sandhan-etal-2022-prabhupadavani,
title = "Prabhupadavani: A Code-mixed Speech Translation Data for 25 Languages",
author = "Sandhan, Jivnesh and
Daksh, Ayush and
Paranjay, Om Adideva and
Behera, Laxmidhar and
Goyal, Pawan",
editor = "Degaetano, Stefania and
Kazantseva, Anna and
Reiter, Nils and
Szpakowicz, Stan",
booktitle = "Proceedings of the 6th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Conference on Computational Linguistics",
url = "https://aclanthology.org/2022.latechclfl-1.4",
pages = "24--29",
abstract = "Nowadays, the interest in code-mixing has become ubiquitous in Natural Language Processing (NLP); however, not much attention has been given to address this phenomenon for Speech Translation (ST) task. This can be solely attributed to the lack of code-mixed ST task labelled data. Thus, we introduce Prabhupadavani, which is a multilingual code-mixed ST dataset for 25 languages. It is multi-domain, covers ten language families, containing 94 hours of speech by 130+ speakers, manually aligned with corresponding text in the target language. The Prabhupadavani is about Vedic culture and heritage from Indic literature, where code-switching in the case of quotation from literature is important in the context of humanities teaching. To the best of our knowledge, Prabhupadvani is the first multi-lingual code-mixed ST dataset available in the ST literature. This data also can be used for a code-mixed machine translation task. All the dataset can be accessed at: \url{https://github.com/frozentoad9/CMST}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sandhan-etal-2022-prabhupadavani">
<titleInfo>
<title>Prabhupadavani: A Code-mixed Speech Translation Data for 25 Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jivnesh</namePart>
<namePart type="family">Sandhan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayush</namePart>
<namePart type="family">Daksh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Om</namePart>
<namePart type="given">Adideva</namePart>
<namePart type="family">Paranjay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laxmidhar</namePart>
<namePart type="family">Behera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pawan</namePart>
<namePart type="family">Goyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature</title>
</titleInfo>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nils</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Conference on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Nowadays, the interest in code-mixing has become ubiquitous in Natural Language Processing (NLP); however, not much attention has been given to address this phenomenon for Speech Translation (ST) task. This can be solely attributed to the lack of code-mixed ST task labelled data. Thus, we introduce Prabhupadavani, which is a multilingual code-mixed ST dataset for 25 languages. It is multi-domain, covers ten language families, containing 94 hours of speech by 130+ speakers, manually aligned with corresponding text in the target language. The Prabhupadavani is about Vedic culture and heritage from Indic literature, where code-switching in the case of quotation from literature is important in the context of humanities teaching. To the best of our knowledge, Prabhupadvani is the first multi-lingual code-mixed ST dataset available in the ST literature. This data also can be used for a code-mixed machine translation task. All the dataset can be accessed at: https://github.com/frozentoad9/CMST.</abstract>
<identifier type="citekey">sandhan-etal-2022-prabhupadavani</identifier>
<location>
<url>https://aclanthology.org/2022.latechclfl-1.4</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>24</start>
<end>29</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Prabhupadavani: A Code-mixed Speech Translation Data for 25 Languages
%A Sandhan, Jivnesh
%A Daksh, Ayush
%A Paranjay, Om Adideva
%A Behera, Laxmidhar
%A Goyal, Pawan
%Y Degaetano, Stefania
%Y Kazantseva, Anna
%Y Reiter, Nils
%Y Szpakowicz, Stan
%S Proceedings of the 6th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature
%D 2022
%8 October
%I International Conference on Computational Linguistics
%C Gyeongju, Republic of Korea
%F sandhan-etal-2022-prabhupadavani
%X Nowadays, the interest in code-mixing has become ubiquitous in Natural Language Processing (NLP); however, not much attention has been given to address this phenomenon for Speech Translation (ST) task. This can be solely attributed to the lack of code-mixed ST task labelled data. Thus, we introduce Prabhupadavani, which is a multilingual code-mixed ST dataset for 25 languages. It is multi-domain, covers ten language families, containing 94 hours of speech by 130+ speakers, manually aligned with corresponding text in the target language. The Prabhupadavani is about Vedic culture and heritage from Indic literature, where code-switching in the case of quotation from literature is important in the context of humanities teaching. To the best of our knowledge, Prabhupadvani is the first multi-lingual code-mixed ST dataset available in the ST literature. This data also can be used for a code-mixed machine translation task. All the dataset can be accessed at: https://github.com/frozentoad9/CMST.
%U https://aclanthology.org/2022.latechclfl-1.4
%P 24-29
Markdown (Informal)
[Prabhupadavani: A Code-mixed Speech Translation Data for 25 Languages](https://aclanthology.org/2022.latechclfl-1.4) (Sandhan et al., LaTeCHCLfL 2022)
ACL
- Jivnesh Sandhan, Ayush Daksh, Om Adideva Paranjay, Laxmidhar Behera, and Pawan Goyal. 2022. Prabhupadavani: A Code-mixed Speech Translation Data for 25 Languages. In Proceedings of the 6th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pages 24–29, Gyeongju, Republic of Korea. International Conference on Computational Linguistics.