% BibTeX record for the LaTeCH-CLfL 2026 workshop paper "Catalogues as Data" (ACL Anthology URL in the url field).
@inproceedings{hill-etal-2026-catalogues-data,
  title     = {Catalogues as Data: Interpretable {NLP} Pipelines for {Ottoman}-{Turkish} Bibliographies},
  author    = {Hill, Mark and
               Bulus, Ayse and
               Spence, Paul},
  editor    = {Alves, Diego and
               Bizzoni, Yuri and
               Degaetano-Ortlieb, Stefania and
               Kazantseva, Anna and
               Pagel, Janis and
               Szpakowicz, Stan},
  booktitle = {Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.latechclfl-1.12/},
  pages     = {128--134},
  isbn      = {979-8-89176-373-9},
  abstract  = {Bibliographies are both humanities infrastructure and historic record. To computationally analyse them, however, requires implementing complex digitisation and standardisation decisions. This paper turns to Seyfettin {\"O}zege{'}s Eski Harflerle Bas{\i}lm{\i}{\c{s}} T{\"u}rk{\c{c}}e Eserler Katalo{\u{g}}u as an example, a scanned set of volumes marked by complex page layouts, degraded typography, irregular entry structures, and historically contingent inconsistencies. With this we present a pipeline that constructs a structured, machine-readable, and analysable dataset out of the 27,000 entries with computer vision, OCR, large and visual language models, sequence-based validation, and custom review tools. This process captures 97.8{\%} of records, with remaining cases capable of being addressed by targeted review. This process demonstrates that combining LLMs with interpretable, review-centric pipelines, offers an appropriate approach for historically complex bibliographic sources.}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record: the paper (mods) with its proceedings volume as relatedItem type="host". -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="hill-etal-2026-catalogues-data">
    <!-- Paper title and authors -->
    <titleInfo>
      <title>Catalogues as Data: Interpretable NLP Pipelines for Ottoman-Turkish Bibliographies</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Hill</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ayse</namePart>
      <namePart type="family">Bulus</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paul</namePart>
      <namePart type="family">Spence</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Diego</namePart>
        <namePart type="family">Alves</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yuri</namePart>
        <namePart type="family">Bizzoni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stefania</namePart>
        <namePart type="family">Degaetano-Ortlieb</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Kazantseva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Janis</namePart>
        <namePart type="family">Pagel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stan</namePart>
        <namePart type="family">Szpakowicz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-373-9</identifier>
    </relatedItem>
    <abstract>Bibliographies are both humanities infrastructure and historic record. To computationally analyse them, however, requires implementing complex digitisation and standardisation decisions. This paper turns to Seyfettin Özege’s Eski Harflerle Basılmış Türkçe Eserler Kataloğu as an example, a scanned set of volumes marked by complex page layouts, degraded typography, irregular entry structures, and historically contingent inconsistencies. With this we present a pipeline that constructs a structured, machine-readable, and analysable dataset out of the 27,000 entries with computer vision, OCR, large and visual language models, sequence-based validation, and custom review tools. This process captures 97.8% of records, with remaining cases capable of being addressed by targeted review. This process demonstrates that combining LLMs with interpretable, review-centric pipelines, offers an appropriate approach for historically complex bibliographic sources.</abstract>
    <identifier type="citekey">hill-etal-2026-catalogues-data</identifier>
    <location>
      <url>https://aclanthology.org/2026.latechclfl-1.12/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>128</start>
        <end>134</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Catalogues as Data: Interpretable NLP Pipelines for Ottoman-Turkish Bibliographies
%A Hill, Mark
%A Bulus, Ayse
%A Spence, Paul
%Y Alves, Diego
%Y Bizzoni, Yuri
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Pagel, Janis
%Y Szpakowicz, Stan
%S Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-373-9
%F hill-etal-2026-catalogues-data
%X Bibliographies are both humanities infrastructure and historic record. To computationally analyse them, however, requires implementing complex digitisation and standardisation decisions. This paper turns to Seyfettin Özege’s Eski Harflerle Basılmış Türkçe Eserler Kataloğu as an example, a scanned set of volumes marked by complex page layouts, degraded typography, irregular entry structures, and historically contingent inconsistencies. With this we present a pipeline that constructs a structured, machine-readable, and analysable dataset out of the 27,000 entries with computer vision, OCR, large and visual language models, sequence-based validation, and custom review tools. This process captures 97.8% of records, with remaining cases capable of being addressed by targeted review. This process demonstrates that combining LLMs with interpretable, review-centric pipelines, offers an appropriate approach for historically complex bibliographic sources.
%U https://aclanthology.org/2026.latechclfl-1.12/
%P 128-134
Markdown (Informal)
[Catalogues as Data: Interpretable NLP Pipelines for Ottoman-Turkish Bibliographies](https://aclanthology.org/2026.latechclfl-1.12/) (Hill et al., LaTeCH-CLfL 2026)
ACL