% Citation record for the Itihasa Sanskrit-to-English translation corpus paper,
% published in the WAT2021 workshop proceedings (pages 191--197).
% Proper nouns in the title are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitals without breaking
% kerning or hyphenation inside the word.
@inproceedings{aralikatte-etal-2021-itihasa,
    title     = {Itihasa: A large-scale corpus for {Sanskrit} to {English} translation},
    author    = {Aralikatte, Rahul and
                 de Lhoneux, Miryam and
                 Kunchukuttan, Anoop and
                 S{\o}gaard, Anders},
    booktitle = {Proceedings of the 8th Workshop on Asian Translation (WAT2021)},
    month     = aug,
    year      = {2021},
    address   = {Online},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2021.wat-1.22},
    doi       = {10.18653/v1/2021.wat-1.22},
    pages     = {191--197},
    abstract  = {This work introduces Itihasa, a large-scale translation dataset containing 93,000 pairs of Sanskrit shlokas and their English translations. The shlokas are extracted from two Indian epics viz., The Ramayana and The Mahabharata. We first describe the motivation behind the curation of such a dataset and follow up with empirical analysis to bring out its nuances. We then benchmark the performance of standard translation models on this corpus and show that even state-of-the-art transformer architectures perform poorly, emphasizing the complexity of the dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 collection holding a single record, ID aralikatte-etal-2021-itihasa. -->
  <mods ID="aralikatte-etal-2021-itihasa">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>Itihasa: A large-scale corpus for Sanskrit to English translation</title>
    </titleInfo>
    <!-- Four personal names follow, each carrying the marcrelator "author" role. -->
    <name type="personal">
      <namePart type="given">Rahul</namePart>
      <namePart type="family">Aralikatte</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Miryam</namePart>
      <namePart type="family">de Lhoneux</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anoop</namePart>
      <namePart type="family">Kunchukuttan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anders</namePart>
      <namePart type="family">Søgaard</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper (year-month). -->
    <originInfo>
      <dateIssued>2021-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume that contains this paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 8th Workshop on Asian Translation (WAT2021)</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This work introduces Itihasa, a large-scale translation dataset containing 93,000 pairs of Sanskrit shlokas and their English translations. The shlokas are extracted from two Indian epics viz., The Ramayana and The Mahabharata. We first describe the motivation behind the curation of such a dataset and follow up with empirical analysis to bring out its nuances. We then benchmark the performance of standard translation models on this corpus and show that even state-of-the-art transformer architectures perform poorly, emphasizing the complexity of the dataset.</abstract>
    <!-- Identifiers: local citation key and DOI. -->
    <identifier type="citekey">aralikatte-etal-2021-itihasa</identifier>
    <identifier type="doi">10.18653/v1/2021.wat-1.22</identifier>
    <location>
      <url>https://aclanthology.org/2021.wat-1.22</url>
    </location>
    <!-- Page extent of the paper. -->
    <part>
      <date>2021-08</date>
      <extent unit="page">
        <start>191</start>
        <end>197</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Itihasa: A large-scale corpus for Sanskrit to English translation
%A Aralikatte, Rahul
%A de Lhoneux, Miryam
%A Kunchukuttan, Anoop
%A Søgaard, Anders
%S Proceedings of the 8th Workshop on Asian Translation (WAT2021)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F aralikatte-etal-2021-itihasa
%X This work introduces Itihasa, a large-scale translation dataset containing 93,000 pairs of Sanskrit shlokas and their English translations. The shlokas are extracted from two Indian epics viz., The Ramayana and The Mahabharata. We first describe the motivation behind the curation of such a dataset and follow up with empirical analysis to bring out its nuances. We then benchmark the performance of standard translation models on this corpus and show that even state-of-the-art transformer architectures perform poorly, emphasizing the complexity of the dataset.
%R 10.18653/v1/2021.wat-1.22
%U https://aclanthology.org/2021.wat-1.22
%U https://doi.org/10.18653/v1/2021.wat-1.22
%P 191-197
Markdown (Informal)
[Itihasa: A large-scale corpus for Sanskrit to English translation](https://aclanthology.org/2021.wat-1.22) (Aralikatte et al., WAT 2021)
ACL