@inproceedings{agarwal-etal-2025-developing,
title = "Developing a Mixed-Methods Pipeline for Community-Oriented Digitization of Kwak{'}wala Legacy Texts",
author = "Agarwal, Milind and
Anastasopoulos, Antonios and
Rosenblum, Daisy",
editor = "Lachler, Jordan and
Agyapong, Godfred and
Arppe, Antti and
Moeller, Sarah and
Chaudhary, Aditi and
Rijhwani, Shruti and
Rosenblum, Daisy",
booktitle = "Proceedings of the Eighth Workshop on the Use of Computational Methods in the Study of Endangered Languages",
month = mar,
year = "2025",
address = "Honolulu, Hawaii, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.computel-main.15/",
pages = "133--138",
abstract = "Kwak{'}wala is an Indigenous language spoken in British Columbia, with a rich legacy of published documentation spanning more than a century, and an active community of speakers, teachers, and learners engaged in language revitalization. Over 11 volumes of the earliest texts created during the collaboration between Franz Boas and George Hunt have been scanned but remain unreadable by machines. Complete digitization through optical character recognition has the potential to facilitate transliteration into modern orthographies and the creation of other language technologies. In this paper, we apply the latest OCR techniques to a series of Kwak{'}wala texts only accessible as images, and discuss the challenges and unique adaptations necessary to make such technologies work for these real-world texts. Building on previous methods, we propose using a mix of off-the-shelf OCR methods, language identification, and masking to effectively isolate Kwak{'}wala text, along with post-correction models, to produce a final high-quality transcription."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="agarwal-etal-2025-developing">
<titleInfo>
<title>Developing a Mixed-Methods Pipeline for Community-Oriented Digitization of Kwak’wala Legacy Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Milind</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daisy</namePart>
<namePart type="family">Rosenblum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth Workshop on the Use of Computational Methods in the Study of Endangered Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Lachler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Godfred</namePart>
<namePart type="family">Agyapong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antti</namePart>
<namePart type="family">Arppe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="family">Moeller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditi</namePart>
<namePart type="family">Chaudhary</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daisy</namePart>
<namePart type="family">Rosenblum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Honolulu, Hawaii, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Kwak’wala is an Indigenous language spoken in British Columbia, with a rich legacy of published documentation spanning more than a century, and an active community of speakers, teachers, and learners engaged in language revitalization. Over 11 volumes of the earliest texts created during the collaboration between Franz Boas and George Hunt have been scanned but remain unreadable by machines. Complete digitization through optical character recognition has the potential to facilitate transliteration into modern orthographies and the creation of other language technologies. In this paper, we apply the latest OCR techniques to a series of Kwak’wala texts only accessible as images, and discuss the challenges and unique adaptations necessary to make such technologies work for these real-world texts. Building on previous methods, we propose using a mix of off-the-shelf OCR methods, language identification, and masking to effectively isolate Kwak’wala text, along with post-correction models, to produce a final high-quality transcription.</abstract>
<identifier type="citekey">agarwal-etal-2025-developing</identifier>
<location>
<url>https://aclanthology.org/2025.computel-main.15/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>133</start>
<end>138</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Developing a Mixed-Methods Pipeline for Community-Oriented Digitization of Kwak’wala Legacy Texts
%A Agarwal, Milind
%A Anastasopoulos, Antonios
%A Rosenblum, Daisy
%Y Lachler, Jordan
%Y Agyapong, Godfred
%Y Arppe, Antti
%Y Moeller, Sarah
%Y Chaudhary, Aditi
%Y Rijhwani, Shruti
%Y Rosenblum, Daisy
%S Proceedings of the Eighth Workshop on the Use of Computational Methods in the Study of Endangered Languages
%D 2025
%8 March
%I Association for Computational Linguistics
%C Honolulu, Hawaii, USA
%F agarwal-etal-2025-developing
%X Kwak’wala is an Indigenous language spoken in British Columbia, with a rich legacy of published documentation spanning more than a century, and an active community of speakers, teachers, and learners engaged in language revitalization. Over 11 volumes of the earliest texts created during the collaboration between Franz Boas and George Hunt have been scanned but remain unreadable by machines. Complete digitization through optical character recognition has the potential to facilitate transliteration into modern orthographies and the creation of other language technologies. In this paper, we apply the latest OCR techniques to a series of Kwak’wala texts only accessible as images, and discuss the challenges and unique adaptations necessary to make such technologies work for these real-world texts. Building on previous methods, we propose using a mix of off-the-shelf OCR methods, language identification, and masking to effectively isolate Kwak’wala text, along with post-correction models, to produce a final high-quality transcription.
%U https://aclanthology.org/2025.computel-main.15/
%P 133-138
Markdown (Informal)
[Developing a Mixed-Methods Pipeline for Community-Oriented Digitization of Kwak’wala Legacy Texts](https://aclanthology.org/2025.computel-main.15/) (Agarwal et al., ComputEL 2025)
ACL