@inproceedings{nithiyananthan-etal-2025-tsd,
title = "{TSD}: Towards Computational Processing of {T}amil Similes - A {T}amil Simile Dataset",
author = "Nithiyananthan, Aathavan and
Raveendra, Jathushan and
Thayasivam, Uthayasanker",
editor = "Chakravarthi, Bharathi Raja and
Priyadharshini, Ruba and
Madasamy, Anand Kumar and
Thavareesan, Sajeetha and
Sherly, Elizabeth and
Rajiakodi, Saranya and
Palani, Balasubramanian and
Subramanian, Malliga and
Cn, Subalalitha and
Chinnappa, Dhivya",
booktitle = "Proceedings of the Fifth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages",
month = may,
year = "2025",
address = "Acoma, The Albuquerque Convention Center, Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.dravidianlangtech-1.99/",
doi = "10.18653/v1/2025.dravidianlangtech-1.99",
pages = "573--579",
ISBN = "979-8-89176-228-2",
abstract = "A simile is a powerful figure of speech that makes a comparison between two different things via shared properties, often using words like ``like'' or ``as'' to create vivid imagery, convey emotions, and enhance understanding. However, computational research on similes is limited in low-resource languages like Tamil due to the lack of simile datasets. This work introduces a manually annotated Tamil Simile Dataset (TSD) comprising around 1.5k simile sentences drawn from various sources. Our data annotation guidelines ensure that all the simile sentences are annotated with the three components, namely tenor, vehicle, and context. We benchmark our dataset for simile interpretation and simile generation tasks using chosen pre-trained language models (PLMs) and present the results. Our findings highlight the challenges of simile tasks in Tamil, suggesting areas for further improvement. We believe that TSD will drive progress in computational simile processing for Tamil and other low-resource languages, further advancing simile related tasks in Natural Language Processing."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nithiyananthan-etal-2025-tsd">
<titleInfo>
<title>TSD: Towards Computational Processing of Tamil Similes - A Tamil Simile Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Aathavan</namePart>
<namePart type="family">Nithiyananthan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jathushan</namePart>
<namePart type="family">Raveendra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Uthayasanker</namePart>
<namePart type="family">Thayasivam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bharathi</namePart>
<namePart type="given">Raja</namePart>
<namePart type="family">Chakravarthi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruba</namePart>
<namePart type="family">Priyadharshini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anand</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Madasamy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sajeetha</namePart>
<namePart type="family">Thavareesan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elizabeth</namePart>
<namePart type="family">Sherly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saranya</namePart>
<namePart type="family">Rajiakodi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Balasubramanian</namePart>
<namePart type="family">Palani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malliga</namePart>
<namePart type="family">Subramanian</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Subalalitha</namePart>
<namePart type="family">Cn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhivya</namePart>
<namePart type="family">Chinnappa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Acoma, The Albuquerque Convention Center, Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-228-2</identifier>
</relatedItem>
<abstract>A simile is a powerful figure of speech that makes a comparison between two different things via shared properties, often using words like “like” or “as” to create vivid imagery, convey emotions, and enhance understanding. However, computational research on similes is limited in low-resource languages like Tamil due to the lack of simile datasets. This work introduces a manually annotated Tamil Simile Dataset (TSD) comprising around 1.5k simile sentences drawn from various sources. Our data annotation guidelines ensure that all the simile sentences are annotated with the three components, namely tenor, vehicle, and context. We benchmark our dataset for simile interpretation and simile generation tasks using chosen pre-trained language models (PLMs) and present the results. Our findings highlight the challenges of simile tasks in Tamil, suggesting areas for further improvement. We believe that TSD will drive progress in computational simile processing for Tamil and other low-resource languages, further advancing simile related tasks in Natural Language Processing.</abstract>
<identifier type="citekey">nithiyananthan-etal-2025-tsd</identifier>
<identifier type="doi">10.18653/v1/2025.dravidianlangtech-1.99</identifier>
<location>
<url>https://aclanthology.org/2025.dravidianlangtech-1.99/</url>
</location>
<part>
<date>2025-05</date>
<extent unit="page">
<start>573</start>
<end>579</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TSD: Towards Computational Processing of Tamil Similes - A Tamil Simile Dataset
%A Nithiyananthan, Aathavan
%A Raveendra, Jathushan
%A Thayasivam, Uthayasanker
%Y Chakravarthi, Bharathi Raja
%Y Priyadharshini, Ruba
%Y Madasamy, Anand Kumar
%Y Thavareesan, Sajeetha
%Y Sherly, Elizabeth
%Y Rajiakodi, Saranya
%Y Palani, Balasubramanian
%Y Subramanian, Malliga
%Y Cn, Subalalitha
%Y Chinnappa, Dhivya
%S Proceedings of the Fifth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages
%D 2025
%8 May
%I Association for Computational Linguistics
%C Acoma, The Albuquerque Convention Center, Albuquerque, New Mexico
%@ 979-8-89176-228-2
%F nithiyananthan-etal-2025-tsd
%X A simile is a powerful figure of speech that makes a comparison between two different things via shared properties, often using words like “like” or “as” to create vivid imagery, convey emotions, and enhance understanding. However, computational research on similes is limited in low-resource languages like Tamil due to the lack of simile datasets. This work introduces a manually annotated Tamil Simile Dataset (TSD) comprising around 1.5k simile sentences drawn from various sources. Our data annotation guidelines ensure that all the simile sentences are annotated with the three components, namely tenor, vehicle, and context. We benchmark our dataset for simile interpretation and simile generation tasks using chosen pre-trained language models (PLMs) and present the results. Our findings highlight the challenges of simile tasks in Tamil, suggesting areas for further improvement. We believe that TSD will drive progress in computational simile processing for Tamil and other low-resource languages, further advancing simile related tasks in Natural Language Processing.
%R 10.18653/v1/2025.dravidianlangtech-1.99
%U https://aclanthology.org/2025.dravidianlangtech-1.99/
%U https://doi.org/10.18653/v1/2025.dravidianlangtech-1.99
%P 573-579
Markdown (Informal)
[TSD: Towards Computational Processing of Tamil Similes - A Tamil Simile Dataset](https://aclanthology.org/2025.dravidianlangtech-1.99/) (Nithiyananthan et al., DravidianLangTech 2025)
ACL