@inproceedings{bernardo-estuar-2025-bai,
title = "b{AI}-b{AI}: A Context-Aware Transliteration System for Baybayin Scripts",
author = "Bernardo, Jacob Simon D. and
Estuar, Maria Regina Justina E.",
editor = "Wijaya, Derry and
Aji, Alham Fikri and
Vania, Clara and
Winata, Genta Indra and
Purwarianti, Ayu",
booktitle = "Proceedings of the Second Workshop in South East Asian Language Processing",
month = jan,
year = "2025",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.sealp-1.1/",
pages = "1--9",
abstract = "Baybayin, a pre-colonial writing system from the Philippines, has seen a resurgence in recent years. Research in computational linguistics has shown an increasing interest in Baybayin OCR, which focuses on the recognition and classification of script characters. However, existing studies face challenges with ambiguous Baybayin words that have multiple possible transliterations. This study introduces a disambiguation technique that employs word embeddings (WE) for contextual analysis and uses part-of-speech (POS) tagging as an initial filtering step. This approach is compared with an LLM method that prompts GPT-4o mini to determine the most appropriate transliteration given a sentence input. The proposed disambiguation process is integrated into existing Baybayin OCR systems to develop bAI-bAI, a context-aware Baybayin transliteration system capable of handling ambiguous words. Results show that incorporating POS as a filter does not significantly affect performance. The WE-Only method yields an accuracy of 77.46{\%} and takes 5.35ms to process one sample while leveraging GPT-4o mini peaks at a higher accuracy of 90.52{\%} but with a much longer runtime of 3280ms per sample. These findings present an opportunity to further explore and improve NLP approaches in disambiguation methods."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bernardo-estuar-2025-bai">
<titleInfo>
<title>bAI-bAI: A Context-Aware Transliteration System for Baybayin Scripts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="given">Simon</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Bernardo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">Regina</namePart>
<namePart type="given">Justina</namePart>
<namePart type="given">E</namePart>
<namePart type="family">Estuar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop in South East Asian Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Derry</namePart>
<namePart type="family">Wijaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clara</namePart>
<namePart type="family">Vania</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Genta</namePart>
<namePart type="given">Indra</namePart>
<namePart type="family">Winata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayu</namePart>
<namePart type="family">Purwarianti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Baybayin, a pre-colonial writing system from the Philippines, has seen a resurgence in recent years. Research in computational linguistics has shown an increasing interest in Baybayin OCR, which focuses on the recognition and classification of script characters. However, existing studies face challenges with ambiguous Baybayin words that have multiple possible transliterations. This study introduces a disambiguation technique that employs word embeddings (WE) for contextual analysis and uses part-of-speech (POS) tagging as an initial filtering step. This approach is compared with an LLM method that prompts GPT-4o mini to determine the most appropriate transliteration given a sentence input. The proposed disambiguation process is integrated into existing Baybayin OCR systems to develop bAI-bAI, a context-aware Baybayin transliteration system capable of handling ambiguous words. Results show that incorporating POS as a filter does not significantly affect performance. The WE-Only method yields an accuracy of 77.46% and takes 5.35ms to process one sample while leveraging GPT-4o mini peaks at a higher accuracy of 90.52% but with a much longer runtime of 3280ms per sample. These findings present an opportunity to further explore and improve NLP approaches in disambiguation methods.</abstract>
<identifier type="citekey">bernardo-estuar-2025-bai</identifier>
<location>
<url>https://aclanthology.org/2025.sealp-1.1/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>1</start>
<end>9</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T bAI-bAI: A Context-Aware Transliteration System for Baybayin Scripts
%A Bernardo, Jacob Simon D.
%A Estuar, Maria Regina Justina E.
%Y Wijaya, Derry
%Y Aji, Alham Fikri
%Y Vania, Clara
%Y Winata, Genta Indra
%Y Purwarianti, Ayu
%S Proceedings of the Second Workshop in South East Asian Language Processing
%D 2025
%8 January
%I Association for Computational Linguistics
%C Online
%F bernardo-estuar-2025-bai
%X Baybayin, a pre-colonial writing system from the Philippines, has seen a resurgence in recent years. Research in computational linguistics has shown an increasing interest in Baybayin OCR, which focuses on the recognition and classification of script characters. However, existing studies face challenges with ambiguous Baybayin words that have multiple possible transliterations. This study introduces a disambiguation technique that employs word embeddings (WE) for contextual analysis and uses part-of-speech (POS) tagging as an initial filtering step. This approach is compared with an LLM method that prompts GPT-4o mini to determine the most appropriate transliteration given a sentence input. The proposed disambiguation process is integrated into existing Baybayin OCR systems to develop bAI-bAI, a context-aware Baybayin transliteration system capable of handling ambiguous words. Results show that incorporating POS as a filter does not significantly affect performance. The WE-Only method yields an accuracy of 77.46% and takes 5.35ms to process one sample while leveraging GPT-4o mini peaks at a higher accuracy of 90.52% but with a much longer runtime of 3280ms per sample. These findings present an opportunity to further explore and improve NLP approaches in disambiguation methods.
%U https://aclanthology.org/2025.sealp-1.1/
%P 1-9
Markdown (Informal)
[bAI-bAI: A Context-Aware Transliteration System for Baybayin Scripts](https://aclanthology.org/2025.sealp-1.1/) (Bernardo & Estuar, sealp 2025)
ACL