@inproceedings{prasanna-arora-2024-irumozhi,
title = "{I}ru{M}ozhi: Automatically classifying diglossia in {T}amil",
author = "Prasanna, Kabilan and
Arora, Aryaman",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-naacl.195",
doi = "10.18653/v1/2024.findings-naacl.195",
pages = "3096--3103",
abstract = "Tamil, a Dravidian language of South Asia, is a highly diglossic language with two very different registers in everyday use: Literary Tamil (preferred in writing and formal communication) and Spoken Tamil (confined to speech and informal media). Spoken Tamil is under-studied in modern NLP systems compared to Literary Tamil written in the Tamil script, as evidenced by a lack of datasets explicitly targetting the Spoken variety. In this paper, we release IruMozhi, a human-translated dataset of parallel text in Literary and Spoken Tamil. Using IruMozhi, we train classifiers on the task of identifying which Tamil variety a text belongs to. We use these models to gauge the availability of pretraining data in Spoken Tamil, to audit the composition of existing labelled datasets for Tamil, and to encourage future work on the variety.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="prasanna-arora-2024-irumozhi">
<titleInfo>
<title>IruMozhi: Automatically classifying diglossia in Tamil</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kabilan</namePart>
<namePart type="family">Prasanna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aryaman</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: NAACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Tamil, a Dravidian language of South Asia, is a highly diglossic language with two very different registers in everyday use: Literary Tamil (preferred in writing and formal communication) and Spoken Tamil (confined to speech and informal media). Spoken Tamil is under-studied in modern NLP systems compared to Literary Tamil written in the Tamil script, as evidenced by a lack of datasets explicitly targetting the Spoken variety. In this paper, we release IruMozhi, a human-translated dataset of parallel text in Literary and Spoken Tamil. Using IruMozhi, we train classifiers on the task of identifying which Tamil variety a text belongs to. We use these models to gauge the availability of pretraining data in Spoken Tamil, to audit the composition of existing labelled datasets for Tamil, and to encourage future work on the variety.</abstract>
<identifier type="citekey">prasanna-arora-2024-irumozhi</identifier>
<identifier type="doi">10.18653/v1/2024.findings-naacl.195</identifier>
<location>
<url>https://aclanthology.org/2024.findings-naacl.195</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>3096</start>
<end>3103</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T IruMozhi: Automatically classifying diglossia in Tamil
%A Prasanna, Kabilan
%A Arora, Aryaman
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Findings of the Association for Computational Linguistics: NAACL 2024
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F prasanna-arora-2024-irumozhi
%X Tamil, a Dravidian language of South Asia, is a highly diglossic language with two very different registers in everyday use: Literary Tamil (preferred in writing and formal communication) and Spoken Tamil (confined to speech and informal media). Spoken Tamil is under-studied in modern NLP systems compared to Literary Tamil written in the Tamil script, as evidenced by a lack of datasets explicitly targetting the Spoken variety. In this paper, we release IruMozhi, a human-translated dataset of parallel text in Literary and Spoken Tamil. Using IruMozhi, we train classifiers on the task of identifying which Tamil variety a text belongs to. We use these models to gauge the availability of pretraining data in Spoken Tamil, to audit the composition of existing labelled datasets for Tamil, and to encourage future work on the variety.
%R 10.18653/v1/2024.findings-naacl.195
%U https://aclanthology.org/2024.findings-naacl.195
%U https://doi.org/10.18653/v1/2024.findings-naacl.195
%P 3096-3103
Markdown (Informal)
[IruMozhi: Automatically classifying diglossia in Tamil](https://aclanthology.org/2024.findings-naacl.195) (Prasanna & Arora, Findings 2024)
ACL