@inproceedings{nowak-etal-2024-multimodal,
title = "Multimodal Chart Retrieval: A Comparison of Text, Table and Image Based Approaches",
author = "Nowak, Averi and
Piccinno, Francesco and
Altun, Yasemin",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.naacl-long.307",
pages = "5488--5505",
abstract = "We investigate multimodal chart retrieval, addressing the challenge of retrieving image-based charts using textual queries. We compare four approaches: (a) OCR with text retrieval, (b) chart derendering (DePlot) followed by table retrieval, (c) a direct image understanding model (PaLI-3), and (d) a combined PaLI-3 + DePlot approach. As the table retrieval component we introduce Tab-GTR, a text retrieval model augmented with table structure embeddings, achieving state-of-the-art results on the NQ-Tables benchmark with 48.88{\%} R@1. On in-distribution data, the DePlot-based method (b) outperforms PaLI-3 (c), while being significantly more efficient (300M vs 3B trainable parameters). However, DePlot struggles with complex charts, indicating a need for improvements in chart derendering - specifically in terms of chart data diversity and the richness of text/table representations. We found no clear winner between methods (b) and (c) in general, with the best performance achieved by the combined approach (d), and further show that it benefits the most from multi-task training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nowak-etal-2024-multimodal">
<titleInfo>
<title>Multimodal Chart Retrieval: A Comparison of Text, Table and Image Based Approaches</title>
</titleInfo>
<name type="personal">
<namePart type="given">Averi</namePart>
<namePart type="family">Nowak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">Piccinno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yasemin</namePart>
<namePart type="family">Altun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kevin</namePart>
<namePart type="family">Duh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helena</namePart>
<namePart type="family">Gomez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Bethard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mexico City, Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We investigate multimodal chart retrieval, addressing the challenge of retrieving image-based charts using textual queries. We compare four approaches: (a) OCR with text retrieval, (b) chart derendering (DePlot) followed by table retrieval, (c) a direct image understanding model (PaLI-3), and (d) a combined PaLI-3 + DePlot approach. As the table retrieval component we introduce Tab-GTR, a text retrieval model augmented with table structure embeddings, achieving state-of-the-art results on the NQ-Tables benchmark with 48.88% R@1. On in-distribution data, the DePlot-based method (b) outperforms PaLI-3 (c), while being significantly more efficient (300M vs 3B trainable parameters). However, DePlot struggles with complex charts, indicating a need for improvements in chart derendering - specifically in terms of chart data diversity and the richness of text/table representations. We found no clear winner between methods (b) and (c) in general, with the best performance achieved by the combined approach (d), and further show that it benefits the most from multi-task training.</abstract>
<identifier type="citekey">nowak-etal-2024-multimodal</identifier>
<location>
<url>https://aclanthology.org/2024.naacl-long.307</url>
</location>
<part>
<date>2024-06</date>
<extent unit="page">
<start>5488</start>
<end>5505</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multimodal Chart Retrieval: A Comparison of Text, Table and Image Based Approaches
%A Nowak, Averi
%A Piccinno, Francesco
%A Altun, Yasemin
%Y Duh, Kevin
%Y Gomez, Helena
%Y Bethard, Steven
%S Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F nowak-etal-2024-multimodal
%X We investigate multimodal chart retrieval, addressing the challenge of retrieving image-based charts using textual queries. We compare four approaches: (a) OCR with text retrieval, (b) chart derendering (DePlot) followed by table retrieval, (c) a direct image understanding model (PaLI-3), and (d) a combined PaLI-3 + DePlot approach. As the table retrieval component we introduce Tab-GTR, a text retrieval model augmented with table structure embeddings, achieving state-of-the-art results on the NQ-Tables benchmark with 48.88% R@1. On in-distribution data, the DePlot-based method (b) outperforms PaLI-3 (c), while being significantly more efficient (300M vs 3B trainable parameters). However, DePlot struggles with complex charts, indicating a need for improvements in chart derendering - specifically in terms of chart data diversity and the richness of text/table representations. We found no clear winner between methods (b) and (c) in general, with the best performance achieved by the combined approach (d), and further show that it benefits the most from multi-task training.
%U https://aclanthology.org/2024.naacl-long.307
%P 5488-5505
Markdown (Informal)
[Multimodal Chart Retrieval: A Comparison of Text, Table and Image Based Approaches](https://aclanthology.org/2024.naacl-long.307) (Nowak et al., NAACL 2024)
ACL