@inproceedings{gutkin-etal-2022-extensions,
title = "Extensions to {B}rahmic script processing within the {N}isaba library: new scripts, languages and utilities",
author = "Gutkin, Alexander and
Johny, Cibu and
Doctor, Raiomond and
Wolf-Sonkin, Lawrence and
Roark, Brian",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.692/",
pages = "6450--6460",
abstract = "The Brahmic family of scripts is used to record some of the most spoken languages in the world and is arguably the most diverse family of writing systems. In this work, we present several substantial extensions to Brahmic script functionality within the open-source Nisaba library of finite-state script normalization and processing utilities (Johny et al., 2021). First, we extend coverage from the original ten scripts to an additional ten scripts of South Asia and beyond, including some used to record endangered languages such as Dogri. Second, we augment the language layer so that scripts used by multiple languages in distinct ways can be processed correctly for more languages, such as the Bengali script when used for the low-resource language Santali. We document key changes to the finite-state engine required to support these new languages and scripts. Finally, we add new script processing utilities, including lightweight script-level reading normalization that (unlike existing visual normalization) does not preserve visual invariance, and a fixed-input transliteration mechanism specifically tailored to Brahmic text entry with ASCII characters."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gutkin-etal-2022-extensions">
<titleInfo>
<title>Extensions to Brahmic script processing within the Nisaba library: new scripts, languages and utilities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Gutkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cibu</namePart>
<namePart type="family">Johny</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raiomond</namePart>
<namePart type="family">Doctor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lawrence</namePart>
<namePart type="family">Wolf-Sonkin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The Brahmic family of scripts is used to record some of the most spoken languages in the world and is arguably the most diverse family of writing systems. In this work, we present several substantial extensions to Brahmic script functionality within the open-source Nisaba library of finite-state script normalization and processing utilities (Johny et al., 2021). First, we extend coverage from the original ten scripts to an additional ten scripts of South Asia and beyond, including some used to record endangered languages such as Dogri. Second, we augment the language layer so that scripts used by multiple languages in distinct ways can be processed correctly for more languages, such as the Bengali script when used for the low-resource language Santali. We document key changes to the finite-state engine required to support these new languages and scripts. Finally, we add new script processing utilities, including lightweight script-level reading normalization that (unlike existing visual normalization) does not preserve visual invariance, and a fixed-input transliteration mechanism specifically tailored to Brahmic text entry with ASCII characters.</abstract>
<identifier type="citekey">gutkin-etal-2022-extensions</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.692/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>6450</start>
<end>6460</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Extensions to Brahmic script processing within the Nisaba library: new scripts, languages and utilities
%A Gutkin, Alexander
%A Johny, Cibu
%A Doctor, Raiomond
%A Wolf-Sonkin, Lawrence
%A Roark, Brian
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F gutkin-etal-2022-extensions
%X The Brahmic family of scripts is used to record some of the most spoken languages in the world and is arguably the most diverse family of writing systems. In this work, we present several substantial extensions to Brahmic script functionality within the open-source Nisaba library of finite-state script normalization and processing utilities (Johny et al., 2021). First, we extend coverage from the original ten scripts to an additional ten scripts of South Asia and beyond, including some used to record endangered languages such as Dogri. Second, we augment the language layer so that scripts used by multiple languages in distinct ways can be processed correctly for more languages, such as the Bengali script when used for the low-resource language Santali. We document key changes to the finite-state engine required to support these new languages and scripts. Finally, we add new script processing utilities, including lightweight script-level reading normalization that (unlike existing visual normalization) does not preserve visual invariance, and a fixed-input transliteration mechanism specifically tailored to Brahmic text entry with ASCII characters.
%U https://aclanthology.org/2022.lrec-1.692/
%P 6450-6460
Markdown (Informal)
[Extensions to Brahmic script processing within the Nisaba library: new scripts, languages and utilities](https://aclanthology.org/2022.lrec-1.692/) (Gutkin et al., LREC 2022)
ACL