@inproceedings{shrestha-etal-2026-hemolix,
title = "{H}emolix.{T}ab{G}en: Optimized Table Generation from Documents",
author = "Shrestha, Gyanendra and
Ivanov, Todor and
Vemireddy, Karthik and
Pyayt, Anna and
Gubanov, Michael",
editor = "Li, Yunyao and
Rehm, Georg and
Tu, Mei",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-industry.73/",
pages = "1055--1066",
ISBN = "979-8-89176-394-4",
abstract = "Modern Data Lakes contain vast and heterogeneous document collections, making table generation from documents a persistent and nontrivial challenge. Traditional approaches are often rigid {---} i.e. domain-specific, require extensive supervision, or are limited to a set of pre-defined schemas; LLM-based approaches are more flexible, but typically suffer from hallucinations, non-determinism, and high computational costs. To overcome these limitations, we introduce Hemolix.TabGen, a novel scalable LLM-based table generation system that comprehends documents and generates Bi-dimensional tables based on the entire document content. We evaluated TabGen on 4 publicly available datasets spanning multiple domains and observed an Average Precision delta up to 30{\%} compared to vanilla LLMs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shrestha-etal-2026-hemolix">
<titleInfo>
<title>Hemolix.TabGen: Optimized Table Generation from Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gyanendra</namePart>
<namePart type="family">Shrestha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Todor</namePart>
<namePart type="family">Ivanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karthik</namePart>
<namePart type="family">Vemireddy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Pyayt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Gubanov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mei</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-394-4</identifier>
</relatedItem>
<abstract>Modern Data Lakes contain vast and heterogeneous document collections, making table generation from documents a persistent and nontrivial challenge. Traditional approaches are often rigid — i.e. domain-specific, require extensive supervision, or are limited to a set of pre-defined schemas; LLM-based approaches are more flexible, but typically suffer from hallucinations, non-determinism, and high computational costs. To overcome these limitations, we introduce Hemolix.TabGen, a novel scalable LLM-based table generation system that comprehends documents and generates Bi-dimensional tables based on the entire document content. We evaluated TabGen on 4 publicly available datasets spanning multiple domains and observed an Average Precision delta up to 30% compared to vanilla LLMs.</abstract>
<identifier type="citekey">shrestha-etal-2026-hemolix</identifier>
<location>
<url>https://aclanthology.org/2026.acl-industry.73/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>1055</start>
<end>1066</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Hemolix.TabGen: Optimized Table Generation from Documents
%A Shrestha, Gyanendra
%A Ivanov, Todor
%A Vemireddy, Karthik
%A Pyayt, Anna
%A Gubanov, Michael
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F shrestha-etal-2026-hemolix
%X Modern Data Lakes contain vast and heterogeneous document collections, making table generation from documents a persistent and nontrivial challenge. Traditional approaches are often rigid — i.e. domain-specific, require extensive supervision, or are limited to a set of pre-defined schemas; LLM-based approaches are more flexible, but typically suffer from hallucinations, non-determinism, and high computational costs. To overcome these limitations, we introduce Hemolix.TabGen, a novel scalable LLM-based table generation system that comprehends documents and generates Bi-dimensional tables based on the entire document content. We evaluated TabGen on 4 publicly available datasets spanning multiple domains and observed an Average Precision delta up to 30% compared to vanilla LLMs.
%U https://aclanthology.org/2026.acl-industry.73/
%P 1055-1066
Markdown (Informal)
[Hemolix.TabGen: Optimized Table Generation from Documents](https://aclanthology.org/2026.acl-industry.73/) (Shrestha et al., ACL 2026)
ACL
- Gyanendra Shrestha, Todor Ivanov, Karthik Vemireddy, Anna Pyayt, and Michael Gubanov. 2026. Hemolix.TabGen: Optimized Table Generation from Documents. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 1055–1066, San Diego, California, USA. Association for Computational Linguistics.