@inproceedings{heidenreich-etal-2025-page,
title = "Page Stream Segmentation with {LLM}s: Challenges and Applications in Insurance Document Automation",
author = "Heidenreich, Hunter and
Dalvi, Ratish and
Verma, Nikhil and
Getachew, Yosheb",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven and
Darwish, Kareem and
Agarwal, Apoorv",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics: Industry Track",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-industry.26/",
pages = "305--317",
abstract = "Page Stream Segmentation (PSS) is critical for automating document processing in industries like insurance, where unstructured document collections are common. This paper explores the use of large language models (LLMs) for PSS, applying parameter-efficient fine-tuning to real-world insurance data. Our experiments show that LLMs outperform baseline models in page- and stream-level segmentation accuracy. However, stream-level calibration remains challenging, especially for high-stakes applications. We evaluate post-hoc calibration and Monte Carlo dropout, finding limited improvement. Future work will integrate active learning to enhance model calibration and support deployment in practical settings."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="heidenreich-etal-2025-page">
<titleInfo>
<title>Page Stream Segmentation with LLMs: Challenges and Applications in Insurance Document Automation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hunter</namePart>
<namePart type="family">Heidenreich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ratish</namePart>
<namePart type="family">Dalvi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikhil</namePart>
<namePart type="family">Verma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yosheb</namePart>
<namePart type="family">Getachew</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Apoorv</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Page Stream Segmentation (PSS) is critical for automating document processing in industries like insurance, where unstructured document collections are common. This paper explores the use of large language models (LLMs) for PSS, applying parameter-efficient fine-tuning to real-world insurance data. Our experiments show that LLMs outperform baseline models in page- and stream-level segmentation accuracy. However, stream-level calibration remains challenging, especially for high-stakes applications. We evaluate post-hoc calibration and Monte Carlo dropout, finding limited improvement. Future work will integrate active learning to enhance model calibration and support deployment in practical settings.</abstract>
<identifier type="citekey">heidenreich-etal-2025-page</identifier>
<location>
<url>https://aclanthology.org/2025.coling-industry.26/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>305</start>
<end>317</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Page Stream Segmentation with LLMs: Challenges and Applications in Insurance Document Automation
%A Heidenreich, Hunter
%A Dalvi, Ratish
%A Verma, Nikhil
%A Getachew, Yosheb
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%Y Darwish, Kareem
%Y Agarwal, Apoorv
%S Proceedings of the 31st International Conference on Computational Linguistics: Industry Track
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F heidenreich-etal-2025-page
%X Page Stream Segmentation (PSS) is critical for automating document processing in industries like insurance, where unstructured document collections are common. This paper explores the use of large language models (LLMs) for PSS, applying parameter-efficient fine-tuning to real-world insurance data. Our experiments show that LLMs outperform baseline models in page- and stream-level segmentation accuracy. However, stream-level calibration remains challenging, especially for high-stakes applications. We evaluate post-hoc calibration and Monte Carlo dropout, finding limited improvement. Future work will integrate active learning to enhance model calibration and support deployment in practical settings.
%U https://aclanthology.org/2025.coling-industry.26/
%P 305-317
Markdown (Informal)
[Page Stream Segmentation with LLMs: Challenges and Applications in Insurance Document Automation](https://aclanthology.org/2025.coling-industry.26/) (Heidenreich et al., COLING 2025)
ACL