@inproceedings{partanen-etal-2021-processing,
title = "Processing {M}.{A}. Castr{\'e}n{'}s Materials: Multilingual Historical Typed and Handwritten Manuscripts",
author = {Partanen, Niko and
Rueter, Jack and
Alnajjar, Khalid and
H{\"a}m{\"a}l{\"a}inen, Mika},
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
Alnajjar, Khalid and
Partanen, Niko and
Rueter, Jack},
booktitle = "Proceedings of the Workshop on Natural Language Processing for Digital Humanities",
month = dec,
year = "2021",
address = "NIT Silchar, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2021.nlp4dh-1.6",
pages = "47--54",
abstract = "The study forms a technical report of various tasks that have been performed on the materials collected and published by Finnish ethnographer and linguist, Matthias Alexander Castr{\'e}n (1813{--}1852). The Finno-Ugrian Society is publishing Castr{\'e}n{'}s manuscripts as new critical and digital editions, and at the same time different research groups have also paid attention to these materials. We discuss the workflows and technical infrastructure used, and consider how datasets that benefit different computational tasks could be created to further improve the usability of these materials, and also to aid the further processing of similar archived collections. We specifically focus on the parts of the collections that are processed in a way that improves their usability in more technical applications, complementing the earlier work on the cultural and linguistic aspects of these materials. Most of these datasets are openly available in Zenodo. The study points to specific areas where further research is needed, and provides benchmarks for text recognition tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="partanen-etal-2021-processing">
<titleInfo>
<title>Processing M.A. Castrén’s Materials: Multilingual Historical Typed and Handwritten Manuscripts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Niko</namePart>
<namePart type="family">Partanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Rueter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Natural Language Processing for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mika</namePart>
<namePart type="family">Hämäläinen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Alnajjar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Niko</namePart>
<namePart type="family">Partanen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jack</namePart>
<namePart type="family">Rueter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">NIT Silchar, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The study forms a technical report of various tasks that have been performed on the materials collected and published by Finnish ethnographer and linguist, Matthias Alexander Castrén (1813–1852). The Finno-Ugrian Society is publishing Castrén’s manuscripts as new critical and digital editions, and at the same time different research groups have also paid attention to these materials. We discuss the workflows and technical infrastructure used, and consider how datasets that benefit different computational tasks could be created to further improve the usability of these materials, and also to aid the further processing of similar archived collections. We specifically focus on the parts of the collections that are processed in a way that improves their usability in more technical applications, complementing the earlier work on the cultural and linguistic aspects of these materials. Most of these datasets are openly available in Zenodo. The study points to specific areas where further research is needed, and provides benchmarks for text recognition tasks.</abstract>
<identifier type="citekey">partanen-etal-2021-processing</identifier>
<location>
<url>https://aclanthology.org/2021.nlp4dh-1.6</url>
</location>
<part>
<date>2021-12</date>
<extent unit="page">
<start>47</start>
<end>54</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Processing M.A. Castrén’s Materials: Multilingual Historical Typed and Handwritten Manuscripts
%A Partanen, Niko
%A Rueter, Jack
%A Alnajjar, Khalid
%A Hämäläinen, Mika
%Y Hämäläinen, Mika
%Y Alnajjar, Khalid
%Y Partanen, Niko
%Y Rueter, Jack
%S Proceedings of the Workshop on Natural Language Processing for Digital Humanities
%D 2021
%8 December
%I NLP Association of India (NLPAI)
%C NIT Silchar, India
%F partanen-etal-2021-processing
%X The study forms a technical report of various tasks that have been performed on the materials collected and published by Finnish ethnographer and linguist, Matthias Alexander Castrén (1813–1852). The Finno-Ugrian Society is publishing Castrén’s manuscripts as new critical and digital editions, and at the same time different research groups have also paid attention to these materials. We discuss the workflows and technical infrastructure used, and consider how datasets that benefit different computational tasks could be created to further improve the usability of these materials, and also to aid the further processing of similar archived collections. We specifically focus on the parts of the collections that are processed in a way that improves their usability in more technical applications, complementing the earlier work on the cultural and linguistic aspects of these materials. Most of these datasets are openly available in Zenodo. The study points to specific areas where further research is needed, and provides benchmarks for text recognition tasks.
%U https://aclanthology.org/2021.nlp4dh-1.6
%P 47-54
Markdown (Informal)
[Processing M.A. Castrén’s Materials: Multilingual Historical Typed and Handwritten Manuscripts](https://aclanthology.org/2021.nlp4dh-1.6) (Partanen et al., NLP4DH 2021)
ACL