@inproceedings{gold-etal-2023-preserving,
title = "Preserving the Authenticity of Handwritten Learner Language: Annotation Guidelines for Creating Transcripts Retaining Orthographic Features",
author = "Gold, Christian and
Laarmann-quante, Ronja and
Zesch, Torsten",
editor = "Gorman, Kyle and
Sproat, Richard and
Roark, Brian",
booktitle = "Proceedings of the Workshop on Computation and Written Language (CAWL 2023)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.cawl-1.3",
doi = "10.18653/v1/2023.cawl-1.3",
pages = "14--21",
abstract = "Handwritten texts produced by young learners often contain orthographic features like spelling errors, capitalization errors, punctuation errors, and impurities such as strikethroughs, inserts, and smudges. All of those are typically normalized or ignored in existing transcriptions. For applications like handwriting recognition with the goal of automatically analyzing a learner{'}s language performance, however, retaining such features would be necessary. To address this, we present transcription guidelines that retain the features addressed above. Our guidelines were developed iteratively and include numerous example images to illustrate the various issues. On a subset of about 90 double-transcribed texts, we compute inter-annotator agreement and show that our guidelines can be applied with high levels of percentage agreement of about .98. Overall, we transcribed 1,350 learner texts, which is about the same size as the widely adopted handwriting recognition datasets IAM (1,500 pages) and CVL (1,600 pages). Our final corpus can be used to train a handwriting recognition system that transcribes closely to the real productions by young learners. Such a system is a prerequisite for applying automatic orthography feedback systems to handwritten texts in the future.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gold-etal-2023-preserving">
<titleInfo>
<title>Preserving the Authenticity of Handwritten Learner Language: Annotation Guidelines for Creating Transcripts Retaining Orthographic Features</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Gold</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ronja</namePart>
<namePart type="family">Laarmann-quante</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Torsten</namePart>
<namePart type="family">Zesch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Workshop on Computation and Written Language (CAWL 2023)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Gorman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Sproat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Roark</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Handwritten texts produced by young learners often contain orthographic features like spelling errors, capitalization errors, punctuation errors, and impurities such as strikethroughs, inserts, and smudges. All of those are typically normalized or ignored in existing transcriptions. For applications like handwriting recognition with the goal of automatically analyzing a learner’s language performance, however, retaining such features would be necessary. To address this, we present transcription guidelines that retain the features addressed above. Our guidelines were developed iteratively and include numerous example images to illustrate the various issues. On a subset of about 90 double-transcribed texts, we compute inter-annotator agreement and show that our guidelines can be applied with high levels of percentage agreement of about .98. Overall, we transcribed 1,350 learner texts, which is about the same size as the widely adopted handwriting recognition datasets IAM (1,500 pages) and CVL (1,600 pages). Our final corpus can be used to train a handwriting recognition system that transcribes closely to the real productions by young learners. Such a system is a prerequisite for applying automatic orthography feedback systems to handwritten texts in the future.</abstract>
<identifier type="citekey">gold-etal-2023-preserving</identifier>
<identifier type="doi">10.18653/v1/2023.cawl-1.3</identifier>
<location>
<url>https://aclanthology.org/2023.cawl-1.3</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>14</start>
<end>21</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Preserving the Authenticity of Handwritten Learner Language: Annotation Guidelines for Creating Transcripts Retaining Orthographic Features
%A Gold, Christian
%A Laarmann-quante, Ronja
%A Zesch, Torsten
%Y Gorman, Kyle
%Y Sproat, Richard
%Y Roark, Brian
%S Proceedings of the Workshop on Computation and Written Language (CAWL 2023)
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F gold-etal-2023-preserving
%X Handwritten texts produced by young learners often contain orthographic features like spelling errors, capitalization errors, punctuation errors, and impurities such as strikethroughs, inserts, and smudges. All of those are typically normalized or ignored in existing transcriptions. For applications like handwriting recognition with the goal of automatically analyzing a learner’s language performance, however, retaining such features would be necessary. To address this, we present transcription guidelines that retain the features addressed above. Our guidelines were developed iteratively and include numerous example images to illustrate the various issues. On a subset of about 90 double-transcribed texts, we compute inter-annotator agreement and show that our guidelines can be applied with high levels of percentage agreement of about .98. Overall, we transcribed 1,350 learner texts, which is about the same size as the widely adopted handwriting recognition datasets IAM (1,500 pages) and CVL (1,600 pages). Our final corpus can be used to train a handwriting recognition system that transcribes closely to the real productions by young learners. Such a system is a prerequisite for applying automatic orthography feedback systems to handwritten texts in the future.
%R 10.18653/v1/2023.cawl-1.3
%U https://aclanthology.org/2023.cawl-1.3
%U https://doi.org/10.18653/v1/2023.cawl-1.3
%P 14-21
Markdown (Informal)
[Preserving the Authenticity of Handwritten Learner Language: Annotation Guidelines for Creating Transcripts Retaining Orthographic Features](https://aclanthology.org/2023.cawl-1.3) (Gold et al., CAWL 2023)
ACL