@inproceedings{tyers-etal-2023-codex,
title = "Codex to corpus: Exploring annotation and processing for an open and extensible machine-readable edition of the Florentine Codex",
author = "Tyers, Francis and
Pugh, Robert and
Berthoud F., Valery",
editor = "Mager, Manuel and
Ebrahimi, Abteen and
Oncevay, Arturo and
Rice, Enora and
Rijhwani, Shruti and
Palmer, Alexis and
Kann, Katharina",
booktitle = "Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.americasnlp-1.4",
doi = "10.18653/v1/2023.americasnlp-1.4",
pages = "19--29",
abstract = "This paper describes an ongoing effort to create, from the original hand-written text, a machine-readable, linguistically-annotated, and easily-searchable corpus of the Nahuatl portion of the Florentine Codex, a 16th century Mesoamerican manuscript written in Nahuatl and Spanish. The Codex consists of 12 books and over 300,000 tokens. We describe the process of annotating 3 of these books, the steps of text preprocessing undertaken, our approach to efficient manual processing and annotation, and some of the challenges faced along the way. We also report on a set of experiments evaluating our ability to automate the text processing tasks to aid in the remaining annotation effort, and find the results promising despite the relatively low volume of training data. Finally, we briefly present a real use case from the humanities that would benefit from the searchable, linguistically annotated corpus we describe.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tyers-etal-2023-codex">
<titleInfo>
<title>Codex to corpus: Exploring annotation and processing for an open and extensible machine-readable edition of the Florentine Codex</title>
</titleInfo>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Tyers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Pugh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valery</namePart>
<namePart type="family">Berthoud F.</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Mager</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abteen</namePart>
<namePart type="family">Ebrahimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arturo</namePart>
<namePart type="family">Oncevay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enora</namePart>
<namePart type="family">Rice</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexis</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katharina</namePart>
<namePart type="family">Kann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper describes an ongoing effort to create, from the original hand-written text, a machine-readable, linguistically-annotated, and easily-searchable corpus of the Nahuatl portion of the Florentine Codex, a 16th century Mesoamerican manuscript written in Nahuatl and Spanish. The Codex consists of 12 books and over 300,000 tokens. We describe the process of annotating 3 of these books, the steps of text preprocessing undertaken, our approach to efficient manual processing and annotation, and some of the challenges faced along the way. We also report on a set of experiments evaluating our ability to automate the text processing tasks to aid in the remaining annotation effort, and find the results promising despite the relatively low volume of training data. Finally, we briefly present a real use case from the humanities that would benefit from the searchable, linguistically annotated corpus we describe.</abstract>
<identifier type="citekey">tyers-etal-2023-codex</identifier>
<identifier type="doi">10.18653/v1/2023.americasnlp-1.4</identifier>
<location>
<url>https://aclanthology.org/2023.americasnlp-1.4</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>19</start>
<end>29</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Codex to corpus: Exploring annotation and processing for an open and extensible machine-readable edition of the Florentine Codex
%A Tyers, Francis
%A Pugh, Robert
%A Berthoud F., Valery
%Y Mager, Manuel
%Y Ebrahimi, Abteen
%Y Oncevay, Arturo
%Y Rice, Enora
%Y Rijhwani, Shruti
%Y Palmer, Alexis
%Y Kann, Katharina
%S Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F tyers-etal-2023-codex
%X This paper describes an ongoing effort to create, from the original hand-written text, a machine-readable, linguistically-annotated, and easily-searchable corpus of the Nahuatl portion of the Florentine Codex, a 16th century Mesoamerican manuscript written in Nahuatl and Spanish. The Codex consists of 12 books and over 300,000 tokens. We describe the process of annotating 3 of these books, the steps of text preprocessing undertaken, our approach to efficient manual processing and annotation, and some of the challenges faced along the way. We also report on a set of experiments evaluating our ability to automate the text processing tasks to aid in the remaining annotation effort, and find the results promising despite the relatively low volume of training data. Finally, we briefly present a real use case from the humanities that would benefit from the searchable, linguistically annotated corpus we describe.
%R 10.18653/v1/2023.americasnlp-1.4
%U https://aclanthology.org/2023.americasnlp-1.4
%U https://doi.org/10.18653/v1/2023.americasnlp-1.4
%P 19-29
Markdown (Informal)
[Codex to corpus: Exploring annotation and processing for an open and extensible machine-readable edition of the Florentine Codex](https://aclanthology.org/2023.americasnlp-1.4) (Tyers et al., AmericasNLP 2023)
ACL