% ACL Anthology record 2025.regnlp-1.9: workshop paper in the RegNLP 2025 proceedings.
% Acronyms in booktitle are brace-protected so styles that recase it keep their capitals.
@inproceedings{abbas-etal-2025-structured,
  title     = {Structured Tender Entities Extraction from Complex Tables with Few-short Learning},
  author    = {Abbas, Asim and
               Lee, Mark and
               Shanavas, Niloofer and
               Kovatchev, Venelin and
               Ali, Mubashir},
  editor    = {Gokhan, Tuba and
               Wang, Kexin and
               Gurevych, Iryna and
               Briscoe, Ted},
  booktitle = {Proceedings of the 1st Regulatory {NLP} Workshop ({RegNLP} 2025)},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.regnlp-1.9/},
  pages     = {59--67},
  abstract  = {Extracting structured text from complex tables in PDF tender documents remains a challenging task due to the loss of structural and positional information during the extraction process. AI-based models often require extensive training data, making development from scratch both tedious and time-consuming. Our research focuses on identifying tender entities in complex table formats within PDF documents. To address this, we propose a novel approach utilizing few-shot learning with large language models (LLMs) to restore the structure of extracted text. Additionally, handcrafted rules and regular expressions are employed for precise entity classification. To evaluate the robustness of LLMs with few-shot learning, we employ data-shuffling techniques. Our experiments show that current text extraction tools fail to deliver satisfactory results for complex table structures. However, the few-shot learning approach significantly enhances the structural integrity of extracted data and improves the accuracy of tender entity identification.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the paper with citekey abbas-etal-2025-structured. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="abbas-etal-2025-structured">
    <titleInfo>
      <title>Structured Tender Entities Extraction from Complex Tables with Few-short Learning</title>
    </titleInfo>
    <!-- Authors of the paper. -->
    <name type="personal">
      <namePart type="given">Asim</namePart>
      <namePart type="family">Abbas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Lee</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Niloofer</namePart>
      <namePart type="family">Shanavas</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Venelin</namePart>
      <namePart type="family">Kovatchev</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mubashir</namePart>
      <namePart type="family">Ali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume containing the paper, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 1st Regulatory NLP Workshop (RegNLP 2025)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Tuba</namePart>
        <namePart type="family">Gokhan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kexin</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Iryna</namePart>
        <namePart type="family">Gurevych</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ted</namePart>
        <namePart type="family">Briscoe</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Extracting structured text from complex tables in PDF tender documents remains a challenging task due to the loss of structural and positional information during the extraction process. AI-based models often require extensive training data, making development from scratch both tedious and time-consuming. Our research focuses on identifying tender entities in complex table formats within PDF documents. To address this, we propose a novel approach utilizing few-shot learning with large language models (LLMs) to restore the structure of extracted text. Additionally, handcrafted rules and regular expressions are employed for precise entity classification. To evaluate the robustness of LLMs with few-shot learning, we employ data-shuffling techniques. Our experiments show that current text extraction tools fail to deliver satisfactory results for complex table structures. However, the few-shot learning approach significantly enhances the structural integrity of extracted data and improves the accuracy of tender entity identification.</abstract>
    <identifier type="citekey">abbas-etal-2025-structured</identifier>
    <location>
      <url>https://aclanthology.org/2025.regnlp-1.9/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>59</start>
        <end>67</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Structured Tender Entities Extraction from Complex Tables with Few-short Learning
%A Abbas, Asim
%A Lee, Mark
%A Shanavas, Niloofer
%A Kovatchev, Venelin
%A Ali, Mubashir
%Y Gokhan, Tuba
%Y Wang, Kexin
%Y Gurevych, Iryna
%Y Briscoe, Ted
%S Proceedings of the 1st Regulatory NLP Workshop (RegNLP 2025)
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F abbas-etal-2025-structured
%X Extracting structured text from complex tables in PDF tender documents remains a challenging task due to the loss of structural and positional information during the extraction process. AI-based models often require extensive training data, making development from scratch both tedious and time-consuming. Our research focuses on identifying tender entities in complex table formats within PDF documents. To address this, we propose a novel approach utilizing few-shot learning with large language models (LLMs) to restore the structure of extracted text. Additionally, handcrafted rules and regular expressions are employed for precise entity classification. To evaluate the robustness of LLMs with few-shot learning, we employ data-shuffling techniques. Our experiments show that current text extraction tools fail to deliver satisfactory results for complex table structures. However, the few-shot learning approach significantly enhances the structural integrity of extracted data and improves the accuracy of tender entity identification.
%U https://aclanthology.org/2025.regnlp-1.9/
%P 59-67
Markdown (Informal)
[Structured Tender Entities Extraction from Complex Tables with Few-short Learning](https://aclanthology.org/2025.regnlp-1.9/) (Abbas et al., RegNLP 2025)
ACL