@inproceedings{yang-etal-2025-enhanced,
  title     = {Enhanced Table Structure Recognition with Multi-Modal Approach},
  author    = {Yang, Huichen and
               Hellicar, Andrew D. and
               Rybinski, Maciej and
               Karimi, Sarvnaz},
  editor    = {Accomazzi, Alberto and
               Ghosal, Tirthankar and
               Grezes, Felix and
               Lockhart, Kelly},
  booktitle = {Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India and virtual},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.wasp-main.23/},
  pages     = {201--207},
  isbn      = {979-8-89176-310-4},
  abstract  = {Tables are fundamental for presenting information in research articles, technical documents, manuals, and reports. One key challenge is accessing the information in tables that are embedded in Portable Document Format (PDF) files or scanned images. It requires accurately recognising table structures in diverse table layouts and complex tables. The Table Structure Recognition (TSR) task aims to recognise the internal structure of table images and convert them into a machine-readable format. We propose a flexible multi-modal framework for image-based TSR. Our approach employs two-stream transformer encoders alongside task-specific decoders for table structure extraction and cell bounding box detection. Experiments on benchmark datasets demonstrate that our model achieves highly competitive results compared to strong baselines, gaining 5.4{\%} over single-modality approaches on the FinTabNetd dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2025-enhanced">
<titleInfo>
<title>Enhanced Table Structure Recognition with Multi-Modal Approach</title>
</titleInfo>
<name type="personal">
<namePart type="given">Huichen</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrew</namePart>
<namePart type="given">D</namePart>
<namePart type="family">Hellicar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maciej</namePart>
<namePart type="family">Rybinski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarvnaz</namePart>
<namePart type="family">Karimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Accomazzi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tirthankar</namePart>
<namePart type="family">Ghosal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="family">Grezes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="family">Lockhart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India and virtual</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-310-4</identifier>
</relatedItem>
<abstract>Tables are fundamental for presenting information in research articles, technical documents, manuals, and reports. One key challenge is accessing the information in tables that are embedded in Portable Document Format (PDF) files or scanned images. It requires accurately recognising table structures in diverse table layouts and complex tables. The Table Structure Recognition (TSR) task aims to recognise the internal structure of table images and convert them into a machine-readable format. We propose a flexible multi-modal framework for image-based TSR. Our approach employs two-stream transformer encoders alongside task-specific decoders for table structure extraction and cell bounding box detection. Experiments on benchmark datasets demonstrate that our model achieves highly competitive results compared to strong baselines, gaining 5.4% over single-modality approaches on the FinTabNetd dataset.</abstract>
<identifier type="citekey">yang-etal-2025-enhanced</identifier>
<location>
<url>https://aclanthology.org/2025.wasp-main.23/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>201</start>
<end>207</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhanced Table Structure Recognition with Multi-Modal Approach
%A Yang, Huichen
%A Hellicar, Andrew D.
%A Rybinski, Maciej
%A Karimi, Sarvnaz
%Y Accomazzi, Alberto
%Y Ghosal, Tirthankar
%Y Grezes, Felix
%Y Lockhart, Kelly
%S Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India and virtual
%@ 979-8-89176-310-4
%F yang-etal-2025-enhanced
%X Tables are fundamental for presenting information in research articles, technical documents, manuals, and reports. One key challenge is accessing the information in tables that are embedded in Portable Document Format (PDF) files or scanned images. It requires accurately recognising table structures in diverse table layouts and complex tables. The Table Structure Recognition (TSR) task aims to recognise the internal structure of table images and convert them into a machine-readable format. We propose a flexible multi-modal framework for image-based TSR. Our approach employs two-stream transformer encoders alongside task-specific decoders for table structure extraction and cell bounding box detection. Experiments on benchmark datasets demonstrate that our model achieves highly competitive results compared to strong baselines, gaining 5.4% over single-modality approaches on the FinTabNetd dataset.
%U https://aclanthology.org/2025.wasp-main.23/
%P 201-207
Markdown (Informal)
[Enhanced Table Structure Recognition with Multi-Modal Approach](https://aclanthology.org/2025.wasp-main.23/) (Yang et al., WASP 2025)
ACL
- Huichen Yang, Andrew D. Hellicar, Maciej Rybinski, and Sarvnaz Karimi. 2025. Enhanced Table Structure Recognition with Multi-Modal Approach. In Proceedings of the Third Workshop for Artificial Intelligence for Scientific Publications, pages 201–207, Mumbai, India and virtual. Association for Computational Linguistics.