@inproceedings{thayasivam-etal-2025-sita,
title = "{S}i{T}a - {S}inhala and {T}amil Speaker Diarization Dataset in the Wild",
author = "Thayasivam, Uthayasanker and
Gnanenthiram, Thulasithan and
Jeewantha, Shamila and
Jayawickrama, Upeksha",
editor = "Sarveswaran, Kengatharaiyer and
Vaidya, Ashwini and
Krishna Bal, Bal and
Shams, Sana and
Thapa, Surendrabikram",
booktitle = "Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2025.chipsal-1.8/",
pages = "83--92",
abstract = "The dynamic field of speaker diarization continues to present significant challenges, despite notable advancements in recent years and the rising focus on complex acoustic scenarios emphasizes the importance of sustained research efforts in this area. While speech resources for speaker diarization are expanding rapidly, aided by semi-automated techniques, many existing datasets remain outdated and lack authentic real-world conversational data. This challenge is particularly acute for low-resource South Asian languages, due to limited public media data and reduced research efforts. Sinhala and Tamil are two such languages with limited speaker diarization datasets. To address this gap, we introduce a new speaker diarization dataset for these languages and evaluate multiple existing models to assess their performance. This work provides essential resources, a novel dataset and valuable insights from model benchmarks to advance speaker diarization for low-resource languages, particularly Sinhala and Tamil."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thayasivam-etal-2025-sita">
<titleInfo>
<title>SiTa - Sinhala and Tamil Speaker Diarization Dataset in the Wild</title>
</titleInfo>
<name type="personal">
<namePart type="given">Uthayasanker</namePart>
<namePart type="family">Thayasivam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thulasithan</namePart>
<namePart type="family">Gnanenthiram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shamila</namePart>
<namePart type="family">Jeewantha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Upeksha</namePart>
<namePart type="family">Jayawickrama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kengatharaiyer</namePart>
<namePart type="family">Sarveswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashwini</namePart>
<namePart type="family">Vaidya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bal</namePart>
<namePart type="family">Krishna Bal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sana</namePart>
<namePart type="family">Shams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surendrabikram</namePart>
<namePart type="family">Thapa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The dynamic field of speaker diarization continues to present significant challenges, despite notable advancements in recent years and the rising focus on complex acoustic scenarios emphasizes the importance of sustained research efforts in this area. While speech resources for speaker diarization are expanding rapidly, aided by semi-automated techniques, many existing datasets remain outdated and lack authentic real-world conversational data. This challenge is particularly acute for low-resource South Asian languages, due to limited public media data and reduced research efforts. Sinhala and Tamil are two such languages with limited speaker diarization datasets. To address this gap, we introduce a new speaker diarization dataset for these languages and evaluate multiple existing models to assess their performance. This work provides essential resources, a novel dataset and valuable insights from model benchmarks to advance speaker diarization for low-resource languages, particularly Sinhala and Tamil.</abstract>
<identifier type="citekey">thayasivam-etal-2025-sita</identifier>
<location>
<url>https://aclanthology.org/2025.chipsal-1.8/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>83</start>
<end>92</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SiTa - Sinhala and Tamil Speaker Diarization Dataset in the Wild
%A Thayasivam, Uthayasanker
%A Gnanenthiram, Thulasithan
%A Jeewantha, Shamila
%A Jayawickrama, Upeksha
%Y Sarveswaran, Kengatharaiyer
%Y Vaidya, Ashwini
%Y Krishna Bal, Bal
%Y Shams, Sana
%Y Thapa, Surendrabikram
%S Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)
%D 2025
%8 January
%I International Committee on Computational Linguistics
%C Abu Dhabi, UAE
%F thayasivam-etal-2025-sita
%X The dynamic field of speaker diarization continues to present significant challenges, despite notable advancements in recent years and the rising focus on complex acoustic scenarios emphasizes the importance of sustained research efforts in this area. While speech resources for speaker diarization are expanding rapidly, aided by semi-automated techniques, many existing datasets remain outdated and lack authentic real-world conversational data. This challenge is particularly acute for low-resource South Asian languages, due to limited public media data and reduced research efforts. Sinhala and Tamil are two such languages with limited speaker diarization datasets. To address this gap, we introduce a new speaker diarization dataset for these languages and evaluate multiple existing models to assess their performance. This work provides essential resources, a novel dataset and valuable insights from model benchmarks to advance speaker diarization for low-resource languages, particularly Sinhala and Tamil.
%U https://aclanthology.org/2025.chipsal-1.8/
%P 83-92
Markdown (Informal)
[SiTa - Sinhala and Tamil Speaker Diarization Dataset in the Wild](https://aclanthology.org/2025.chipsal-1.8/) (Thayasivam et al., CHiPSAL 2025)
ACL
- Uthayasanker Thayasivam, Thulasithan Gnanenthiram, Shamila Jeewantha, and Upeksha Jayawickrama. 2025. SiTa - Sinhala and Tamil Speaker Diarization Dataset in the Wild. In Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025), pages 83–92, Abu Dhabi, UAE. International Committee on Computational Linguistics.