@inproceedings{haidar-etal-2026-back,
    title     = {Back-of-the-Book Index Automation for {A}rabic Documents},
    author    = {Haidar, Nawal and
                 Kashmar, Ahmad and
                 Zaraket, Fadi},
    booktitle = {Proceedings of the 2nd Workshop on {NLP} for Languages Using {A}rabic Script},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.abjadnlp-1.29/},
    pages     = {208--217},
    abstract  = {Back-of-the-book indexes (BoBIs) are crucial for book readability. However, their manual creation is laborious and error prone. In this paper, we introduce ArBoBIM to automate BoBI extraction and review processes for Arabic books. Given a book with a corresponding BoBI, ArBoBIM extracts BoBI terms and identifies their occurrences and aligns those across several versions of the book. ArBoBIM first defines a pool of candidates for each term by leveraging noun phrases and named entities. ArBoBIM leverages several metrics, including exact matches, morpho-lexical similarity, and semantic similarity, to determine the best candidates. We empirically fine-tuned thresholds for ArBoBIM and achieve an F1-score of 0.94 (precision= 0.97, recall=0.91). These results are significantly better than baseline results, and top LLM based results with lower computational cost and no publishing house IP risks. Additionally, with ArBoBIM, over 500 books have been processed, resulting in the ArBoBIMap dataset, containing books, their terms, occurrences, and various metadata related to them, to be made available for the public. This dataset is used to train a model to identify if a term, given its features, should be added to the back-of-the-book index of a specific book. The model achieves an F1-score of 0.91 (precision = 0.97, recall = 0.85).},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="haidar-etal-2026-back">
<titleInfo>
<title>Back-of-the-Book Index Automation for Arabic Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nawal</namePart>
<namePart type="family">Haidar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmad</namePart>
<namePart type="family">Kashmar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fadi</namePart>
<namePart type="family">Zaraket</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Back-of-the-book indexes (BoBIs) are crucial for book readability. However, their manual creation is laborious and error prone. In this paper, we introduce ArBoBIM to automate BoBI extraction and review processes for Arabic books. Given a book with a corresponding BoBI, ArBoBIM extracts BoBI terms and identifies their occurrences and aligns those across several versions of the book. ArBoBIM first defines a pool of candidates for each term by leveraging noun phrases and named entities. ArBoBIM leverages several metrics, including exact matches, morpho-lexical similarity, and semantic similarity, to determine the best candidates. We empirically fine-tuned thresholds for ArBoBIM and achieve an F1-score of 0.94 (precision= 0.97, recall=0.91). These results are significantly better than baseline results, and top LLM based results with lower computational cost and no publishing house IP risks. Additionally, with ArBoBIM, over 500 books have been processed, resulting in the ArBoBIMap dataset, containing books, their terms, occurrences, and various metadata related to them, to be made available for the public. This dataset is used to train a model to identify if a term, given its features, should be added to the back-of-the-book index of a specific book. The model achieves an F1-score of 0.91 (precision = 0.97, recall = 0.85).</abstract>
<identifier type="citekey">haidar-etal-2026-back</identifier>
<location>
<url>https://aclanthology.org/2026.abjadnlp-1.29/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>208</start>
<end>217</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Back-of-the-Book Index Automation for Arabic Documents
%A Haidar, Nawal
%A Kashmar, Ahmad
%A Zaraket, Fadi
%S Proceedings of the 2nd Workshop on NLP for Languages Using Arabic Script
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%F haidar-etal-2026-back
%X Back-of-the-book indexes (BoBIs) are crucial for book readability. However, their manual creation is laborious and error prone. In this paper, we introduce ArBoBIM to automate BoBI extraction and review processes for Arabic books. Given a book with a corresponding BoBI, ArBoBIM extracts BoBI terms and identifies their occurrences and aligns those across several versions of the book. ArBoBIM first defines a pool of candidates for each term by leveraging noun phrases and named entities. ArBoBIM leverages several metrics, including exact matches, morpho-lexical similarity, and semantic similarity, to determine the best candidates. We empirically fine-tuned thresholds for ArBoBIM and achieve an F1-score of 0.94 (precision= 0.97, recall=0.91). These results are significantly better than baseline results, and top LLM based results with lower computational cost and no publishing house IP risks. Additionally, with ArBoBIM, over 500 books have been processed, resulting in the ArBoBIMap dataset, containing books, their terms, occurrences, and various metadata related to them, to be made available for the public. This dataset is used to train a model to identify if a term, given its features, should be added to the back-of-the-book index of a specific book. The model achieves an F1-score of 0.91 (precision = 0.97, recall = 0.85).
%U https://aclanthology.org/2026.abjadnlp-1.29/
%P 208-217
Markdown (Informal)
[Back-of-the-Book Index Automation for Arabic Documents](https://aclanthology.org/2026.abjadnlp-1.29/) (Haidar et al., AbjadNLP 2026)
ACL