@inproceedings{perera-sumanathilaka-2025-evaluating,
title = "Evaluating Transliteration Ambiguity in Adhoc {R}omanized {S}inhala: A Dataset for Transliteration Disambiguation",
author = "Perera, Sandun Sameera and
Sumanathilaka, Deshan Koshala",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.107/",
pages = "934--942",
abstract = "This paper introduces the first Transliteration disambiguation (TD) dataset for Romanized Sinhala, informally known as Singlish, developed to address the challenge of transliteration ambiguity in backwards transliteration tasks. The dataset covers 22 ambiguous Romanized Sinhala words, each mapping to two distinct Sinhala meanings, and provides 30 Romanized sentences per word: ten for each meaning individually and ten containing both meanings in context. Sentences were initially collected through web scraping and later post-processed using the Claude language model, which offers strong support for Sinhala, alongside a rule-based Romanization process to ensure linguistic quality and consistency. To demonstrate its applicability, the dataset was used to evaluate four existing back-transliteration systems, highlighting their performance in resolving context-sensitive ambiguities. Baseline evaluations confirm the dataset{'}s effectiveness in assessing transliteration systems' ability to handle transliteration ambiguity, offering a valuable resource for advancing TD and transliteration research for Sinhala."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="perera-sumanathilaka-2025-evaluating">
<titleInfo>
<title>Evaluating Transliteration Ambiguity in Adhoc Romanized Sinhala: A Dataset for Transliteration Disambiguation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sandun</namePart>
<namePart type="given">Sameera</namePart>
<namePart type="family">Perera</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Deshan</namePart>
<namePart type="given">Koshala</namePart>
<namePart type="family">Sumanathilaka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces the first Transliteration disambiguation (TD) dataset for Romanized Sinhala, informally known as Singlish, developed to address the challenge of transliteration ambiguity in backwards transliteration tasks. The dataset covers 22 ambiguous Romanized Sinhala words, each mapping to two distinct Sinhala meanings, and provides 30 Romanized sentences per word: ten for each meaning individually and ten containing both meanings in context. Sentences were initially collected through web scraping and later post-processed using the Claude language model, which offers strong support for Sinhala, alongside a rule-based Romanization process to ensure linguistic quality and consistency. To demonstrate its applicability, the dataset was used to evaluate four existing back-transliteration systems, highlighting their performance in resolving context-sensitive ambiguities. Baseline evaluations confirm the dataset’s effectiveness in assessing transliteration systems’ ability to handle transliteration ambiguity, offering a valuable resource for advancing TD and transliteration research for Sinhala.</abstract>
<identifier type="citekey">perera-sumanathilaka-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.107/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>934</start>
<end>942</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Transliteration Ambiguity in Adhoc Romanized Sinhala: A Dataset for Transliteration Disambiguation
%A Perera, Sandun Sameera
%A Sumanathilaka, Deshan Koshala
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F perera-sumanathilaka-2025-evaluating
%X This paper introduces the first Transliteration disambiguation (TD) dataset for Romanized Sinhala, informally known as Singlish, developed to address the challenge of transliteration ambiguity in backwards transliteration tasks. The dataset covers 22 ambiguous Romanized Sinhala words, each mapping to two distinct Sinhala meanings, and provides 30 Romanized sentences per word: ten for each meaning individually and ten containing both meanings in context. Sentences were initially collected through web scraping and later post-processed using the Claude language model, which offers strong support for Sinhala, alongside a rule-based Romanization process to ensure linguistic quality and consistency. To demonstrate its applicability, the dataset was used to evaluate four existing back-transliteration systems, highlighting their performance in resolving context-sensitive ambiguities. Baseline evaluations confirm the dataset’s effectiveness in assessing transliteration systems’ ability to handle transliteration ambiguity, offering a valuable resource for advancing TD and transliteration research for Sinhala.
%U https://aclanthology.org/2025.ranlp-1.107/
%P 934-942
Markdown (Informal)
[Evaluating Transliteration Ambiguity in Adhoc Romanized Sinhala: A Dataset for Transliteration Disambiguation](https://aclanthology.org/2025.ranlp-1.107/) (Perera & Sumanathilaka, RANLP 2025)
ACL