@inproceedings{debaene-hoste-2025-classifying,
title = "Classifying {TEI} Encoding for {D}utch{D}ra{C}or with Transformer Models",
author = "Debaene, Florian and
Hoste, Veronique",
editor = "Peng, Siyao and
Rehbein, Ines",
booktitle = "Proceedings of the 19th Linguistic Annotation Workshop (LAW-XIX-2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.law-1.11/",
doi = "10.18653/v1/2025.law-1.11",
pages = "137--141",
ISBN = "979-8-89176-262-6",
abstract = "Computational Drama Analysis relies on well-structured textual data, yet many dramatic works remain in need of encoding. The Dutch dramatic tradition is one such an example, with currently 180 plays available in the DraCor database, while many more plays await integration still. To facilitate this process, we propose a semi-automated TEI encoding annotation methodology using transformer encoder language models to classify structural elements in Dutch drama. We fine-tune 4 Dutch models on the DutchDraCor dataset to predict the 9 most relevant labels used in the DraCor TEI encoding, experimenting with 2 model input settings. Our results show that incorporating additional context through beginning-of-sequence (BOS) and end-of-sequence (EOS) tokens greatly improves performance, increasing the average macro F1 score across models from 0.717 to 0.923 (+0.206). Using the best-performing model, we generate silver-standard DraCor labels for EmDComF, an unstructured corpus of early modern Dutch comedies and farces, paving the way for its integration into DutchDraCor after validation."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="debaene-hoste-2025-classifying">
<titleInfo>
<title>Classifying TEI Encoding for DutchDraCor with Transformer Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Florian</namePart>
<namePart type="family">Debaene</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Linguistic Annotation Workshop (LAW-XIX-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Siyao</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ines</namePart>
<namePart type="family">Rehbein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-262-6</identifier>
</relatedItem>
<abstract>Computational Drama Analysis relies on well-structured textual data, yet many dramatic works remain in need of encoding. The Dutch dramatic tradition is one such an example, with currently 180 plays available in the DraCor database, while many more plays await integration still. To facilitate this process, we propose a semi-automated TEI encoding annotation methodology using transformer encoder language models to classify structural elements in Dutch drama. We fine-tune 4 Dutch models on the DutchDraCor dataset to predict the 9 most relevant labels used in the DraCor TEI encoding, experimenting with 2 model input settings. Our results show that incorporating additional context through beginning-of-sequence (BOS) and end-of-sequence (EOS) tokens greatly improves performance, increasing the average macro F1 score across models from 0.717 to 0.923 (+0.206). Using the best-performing model, we generate silver-standard DraCor labels for EmDComF, an unstructured corpus of early modern Dutch comedies and farces, paving the way for its integration into DutchDraCor after validation.</abstract>
<identifier type="citekey">debaene-hoste-2025-classifying</identifier>
<identifier type="doi">10.18653/v1/2025.law-1.11</identifier>
<location>
<url>https://aclanthology.org/2025.law-1.11/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>137</start>
<end>141</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Classifying TEI Encoding for DutchDraCor with Transformer Models
%A Debaene, Florian
%A Hoste, Veronique
%Y Peng, Siyao
%Y Rehbein, Ines
%S Proceedings of the 19th Linguistic Annotation Workshop (LAW-XIX-2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-262-6
%F debaene-hoste-2025-classifying
%X Computational Drama Analysis relies on well-structured textual data, yet many dramatic works remain in need of encoding. The Dutch dramatic tradition is one such an example, with currently 180 plays available in the DraCor database, while many more plays await integration still. To facilitate this process, we propose a semi-automated TEI encoding annotation methodology using transformer encoder language models to classify structural elements in Dutch drama. We fine-tune 4 Dutch models on the DutchDraCor dataset to predict the 9 most relevant labels used in the DraCor TEI encoding, experimenting with 2 model input settings. Our results show that incorporating additional context through beginning-of-sequence (BOS) and end-of-sequence (EOS) tokens greatly improves performance, increasing the average macro F1 score across models from 0.717 to 0.923 (+0.206). Using the best-performing model, we generate silver-standard DraCor labels for EmDComF, an unstructured corpus of early modern Dutch comedies and farces, paving the way for its integration into DutchDraCor after validation.
%R 10.18653/v1/2025.law-1.11
%U https://aclanthology.org/2025.law-1.11/
%U https://doi.org/10.18653/v1/2025.law-1.11
%P 137-141
Markdown (Informal)
[Classifying TEI Encoding for DutchDraCor with Transformer Models](https://aclanthology.org/2025.law-1.11/) (Debaene & Hoste, LAW 2025)
ACL