% ACL Anthology record for the Aalamaram Tamil treebank paper (WILDRE 2024 workshop proceedings).
% The proper noun in the title is brace-protected as a whole word so that
% sentence-casing styles keep its capital without splitting the word.
@inproceedings{abirami-etal-2024-aalamaram,
    title     = {Aalamaram: A Large-Scale Linguistically Annotated Treebank for the {Tamil} Language},
    author    = {Abirami, A M and
                 Leong, Wei Qi and
                 Rengarajan, Hamsawardhini and
                 Anitha, D and
                 Suganya, R and
                 Singh, Himanshu and
                 Sarveswaran, Kengatharaiyer and
                 Tjhi, William Chandra and
                 Shah, Rajiv Ratn},
    editor    = {Jha, Girish Nath and
                 L., Sobha and
                 Bali, Kalika and
                 Ojha, Atul Kr.},
    booktitle = {Proceedings of the 7th Workshop on Indian Language Data: Resources and Evaluation},
    month     = may,
    year      = {2024},
    address   = {Torino, Italia},
    publisher = {ELRA and ICCL},
    url       = {https://aclanthology.org/2024.wildre-1.11/},
    pages     = {73--83},
    abstract  = {Tamil is a relatively low-resource language in the field of Natural Language Processing (NLP). Recent years have seen a growth in Tamil NLP datasets in Natural Language Understanding (NLU) or Natural Language Generation (NLG) tasks, but high-quality linguistic resources remain scarce. In order to alleviate this gap in resources, this paper introduces Aalamaram, a treebank with rich linguistic annotations for the Tamil language. It is hitherto the largest publicly available Tamil treebank with almost 10,000 sentences from diverse sources and is annotated for the tasks of Part-of-speech (POS) tagging, Named Entity Recognition (NER), Morphological Parsing and Dependency Parsing. Close attention has also been paid to multi-word segmentation, especially in the context of Tamil clitics. Although the treebank is based largely on the Universal Dependencies (UD) specifications, significant effort has been made to adjust the annotation rules according to the idiosyncrasies and complexities of the Tamil language, thereby providing a valuable resource for linguistic research and NLP developments.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="abirami-etal-2024-aalamaram">
    <!-- Title of the paper itself. -->
    <titleInfo>
      <title>Aalamaram: A Large-Scale Linguistically Annotated Treebank for the Tamil Language</title>
    </titleInfo>
    <!-- Authors, one <name> element each, in the order listed. -->
    <name type="personal">
      <namePart type="given">A</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Abirami</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="given">Qi</namePart>
      <namePart type="family">Leong</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hamsawardhini</namePart>
      <namePart type="family">Rengarajan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">D</namePart>
      <namePart type="family">Anitha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">R</namePart>
      <namePart type="family">Suganya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Himanshu</namePart>
      <namePart type="family">Singh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kengatharaiyer</namePart>
      <namePart type="family">Sarveswaran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">William</namePart>
      <namePart type="given">Chandra</namePart>
      <namePart type="family">Tjhi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rajiv</namePart>
      <namePart type="given">Ratn</namePart>
      <namePart type="family">Shah</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <!-- Issue date of the paper (year-month). -->
    <originInfo>
      <dateIssued>2024-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 7th Workshop on Indian Language Data: Resources and Evaluation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Girish</namePart>
        <namePart type="given">Nath</namePart>
        <namePart type="family">Jha</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sobha</namePart>
        <namePart type="family">L.</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kalika</namePart>
        <namePart type="family">Bali</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Atul</namePart>
        <namePart type="given">Kr.</namePart>
        <namePart type="family">Ojha</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>ELRA and ICCL</publisher>
        <place>
          <placeTerm type="text">Torino, Italia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- Abstract is kept on a single line so its text content is unchanged. -->
    <abstract>Tamil is a relatively low-resource language in the field of Natural Language Processing (NLP). Recent years have seen a growth in Tamil NLP datasets in Natural Language Understanding (NLU) or Natural Language Generation (NLG) tasks, but high-quality linguistic resources remain scarce. In order to alleviate this gap in resources, this paper introduces Aalamaram, a treebank with rich linguistic annotations for the Tamil language. It is hitherto the largest publicly available Tamil treebank with almost 10,000 sentences from diverse sources and is annotated for the tasks of Part-of-speech (POS) tagging, Named Entity Recognition (NER), Morphological Parsing and Dependency Parsing. Close attention has also been paid to multi-word segmentation, especially in the context of Tamil clitics. Although the treebank is based largely on the Universal Dependencies (UD) specifications, significant effort has been made to adjust the annotation rules according to the idiosyncrasies and complexities of the Tamil language, thereby providing a valuable resource for linguistic research and NLP developments.</abstract>
    <!-- Citation key; same value as the ID attribute on <mods>. -->
    <identifier type="citekey">abirami-etal-2024-aalamaram</identifier>
    <location>
      <url>https://aclanthology.org/2024.wildre-1.11/</url>
    </location>
    <!-- Date and page extent of the paper. -->
    <part>
      <date>2024-05</date>
      <extent unit="page">
        <start>73</start>
        <end>83</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Aalamaram: A Large-Scale Linguistically Annotated Treebank for the Tamil Language
%A Abirami, A. M.
%A Leong, Wei Qi
%A Rengarajan, Hamsawardhini
%A Anitha, D.
%A Suganya, R.
%A Singh, Himanshu
%A Sarveswaran, Kengatharaiyer
%A Tjhi, William Chandra
%A Shah, Rajiv Ratn
%Y Jha, Girish Nath
%Y L., Sobha
%Y Bali, Kalika
%Y Ojha, Atul Kr.
%S Proceedings of the 7th Workshop on Indian Language Data: Resources and Evaluation
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F abirami-etal-2024-aalamaram
%X Tamil is a relatively low-resource language in the field of Natural Language Processing (NLP). Recent years have seen a growth in Tamil NLP datasets in Natural Language Understanding (NLU) or Natural Language Generation (NLG) tasks, but high-quality linguistic resources remain scarce. In order to alleviate this gap in resources, this paper introduces Aalamaram, a treebank with rich linguistic annotations for the Tamil language. It is hitherto the largest publicly available Tamil treebank with almost 10,000 sentences from diverse sources and is annotated for the tasks of Part-of-speech (POS) tagging, Named Entity Recognition (NER), Morphological Parsing and Dependency Parsing. Close attention has also been paid to multi-word segmentation, especially in the context of Tamil clitics. Although the treebank is based largely on the Universal Dependencies (UD) specifications, significant effort has been made to adjust the annotation rules according to the idiosyncrasies and complexities of the Tamil language, thereby providing a valuable resource for linguistic research and NLP developments.
%U https://aclanthology.org/2024.wildre-1.11/
%P 73-83
Markdown (Informal)
[Aalamaram: A Large-Scale Linguistically Annotated Treebank for the Tamil Language](https://aclanthology.org/2024.wildre-1.11/) (Abirami et al., WILDRE 2024)
ACL
- A M Abirami, Wei Qi Leong, Hamsawardhini Rengarajan, D Anitha, R Suganya, Himanshu Singh, Kengatharaiyer Sarveswaran, William Chandra Tjhi, and Rajiv Ratn Shah. 2024. Aalamaram: A Large-Scale Linguistically Annotated Treebank for the Tamil Language. In Proceedings of the 7th Workshop on Indian Language Data: Resources and Evaluation, pages 73–83, Torino, Italia. ELRA and ICCL.