@inproceedings{momen-etal-2025-filling,
title = "Filling the Temporal Void: Recovering Missing Publication Years in the {P}roject {G}utenberg Corpus Using {LLM}s",
author = "Momen, Omar and
Schaaf, Manuel and
Mehler, Alexander",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.890/",
doi = "10.18653/v1/2025.findings-acl.890",
pages = "17318--17334",
ISBN = "979-8-89176-256-5",
abstract = "Analysing texts spanning long periods of time is critical for researchers in historical linguistics and related disciplines. However, publicly available corpora suitable for such analyses are scarce. The Project Gutenberg (PG) corpus presents a significant yet underutilized opportunity in this context, due to the absence of accurate temporal metadata. We take advantage of language models and information retrieval to explore four sources of information {--} Open Web, Wikipedia, Open Library API, and PG books texts {--} to add missing temporal metadata to the PG corpus. Through 20 experiments employing state-of-the-art Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) methods, we estimate the production years of all PG books. We curate an enriched metadata repository for the PG corpus and propose a refined version for it, which includes 53,774 books with a total of 3.8 billion tokens in 11 languages, produced between 1600 and 2000. This work provides a new resource for computational linguistics and humanities studies focusing on diachronic analyses. The final dataset and all experiments data are publicly available (https://github.com/OmarMomen14/pg-dates)."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="momen-etal-2025-filling">
<titleInfo>
<title>Filling the Temporal Void: Recovering Missing Publication Years in the Project Gutenberg Corpus Using LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Omar</namePart>
<namePart type="family">Momen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manuel</namePart>
<namePart type="family">Schaaf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexander</namePart>
<namePart type="family">Mehler</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Analysing texts spanning long periods of time is critical for researchers in historical linguistics and related disciplines. However, publicly available corpora suitable for such analyses are scarce. The Project Gutenberg (PG) corpus presents a significant yet underutilized opportunity in this context, due to the absence of accurate temporal metadata. We take advantage of language models and information retrieval to explore four sources of information – Open Web, Wikipedia, Open Library API, and PG books texts – to add missing temporal metadata to the PG corpus. Through 20 experiments employing state-of-the-art Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) methods, we estimate the production years of all PG books. We curate an enriched metadata repository for the PG corpus and propose a refined version for it, which includes 53,774 books with a total of 3.8 billion tokens in 11 languages, produced between 1600 and 2000. This work provides a new resource for computational linguistics and humanities studies focusing on diachronic analyses. The final dataset and all experiments data are publicly available (https://github.com/OmarMomen14/pg-dates).</abstract>
<identifier type="citekey">momen-etal-2025-filling</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.890</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.890/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>17318</start>
<end>17334</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Filling the Temporal Void: Recovering Missing Publication Years in the Project Gutenberg Corpus Using LLMs
%A Momen, Omar
%A Schaaf, Manuel
%A Mehler, Alexander
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F momen-etal-2025-filling
%X Analysing texts spanning long periods of time is critical for researchers in historical linguistics and related disciplines. However, publicly available corpora suitable for such analyses are scarce. The Project Gutenberg (PG) corpus presents a significant yet underutilized opportunity in this context, due to the absence of accurate temporal metadata. We take advantage of language models and information retrieval to explore four sources of information – Open Web, Wikipedia, Open Library API, and PG books texts – to add missing temporal metadata to the PG corpus. Through 20 experiments employing state-of-the-art Large Language Models (LLMs) and Retrieval-Augmented Generation (RAG) methods, we estimate the production years of all PG books. We curate an enriched metadata repository for the PG corpus and propose a refined version for it, which includes 53,774 books with a total of 3.8 billion tokens in 11 languages, produced between 1600 and 2000. This work provides a new resource for computational linguistics and humanities studies focusing on diachronic analyses. The final dataset and all experiments data are publicly available (https://github.com/OmarMomen14/pg-dates).
%R 10.18653/v1/2025.findings-acl.890
%U https://aclanthology.org/2025.findings-acl.890/
%U https://doi.org/10.18653/v1/2025.findings-acl.890
%P 17318-17334
Markdown (Informal)
[Filling the Temporal Void: Recovering Missing Publication Years in the Project Gutenberg Corpus Using LLMs](https://aclanthology.org/2025.findings-acl.890/) (Momen et al., Findings 2025)
ACL