@inproceedings{khalighinejad-etal-2025-matvix,
title = "{M}at{V}i{X}: Multimodal Information Extraction from Visually Rich Articles",
author = "Khalighinejad, Ghazal and
Scott, Sharon and
Liu, Ollie and
Anderson, Kelly L. and
Stureborg, Rickard and
Tyagi, Aman and
Dhingra, Bhuwan",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.185/",
doi = "10.18653/v1/2025.naacl-long.185",
pages = "3636--3655",
ISBN = "979-8-89176-189-6",
abstract = "Multimodal information extraction (MIE) is crucial for scientific literature, where valuable data is often spread across text, figures, and tables. In materials science, extracting structured information from research articles can accelerate the discovery of new materials. However, the multimodal nature and complex interconnections of scientific content present challenges for traditional text-based methods. We introduce MatViX, a benchmark consisting of 324 full-length research articles and 1,688 complex structured JSON files, carefully curated by domain experts in polymer nanocomposites and biodegradation. These JSON files are extracted from text, tables, and figures in full-length documents, providing a comprehensive challenge for MIE. We introduce a novel evaluation method to assess the accuracy of curve similarity and the alignment of hierarchical structures. Additionally, we benchmark vision-language models (VLMs) in a zero-shot manner, capable of processing long contexts and multimodal inputs. Our results demonstrate significant room for improvement in current models."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="khalighinejad-etal-2025-matvix">
<titleInfo>
<title>MatViX: Multimodal Information Extraction from Visually Rich Articles</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ghazal</namePart>
<namePart type="family">Khalighinejad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharon</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ollie</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kelly</namePart>
<namePart type="given">L</namePart>
<namePart type="family">Anderson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rickard</namePart>
<namePart type="family">Stureborg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aman</namePart>
<namePart type="family">Tyagi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bhuwan</namePart>
<namePart type="family">Dhingra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Multimodal information extraction (MIE) is crucial for scientific literature, where valuable data is often spread across text, figures, and tables. In materials science, extracting structured information from research articles can accelerate the discovery of new materials. However, the multimodal nature and complex interconnections of scientific content present challenges for traditional text-based methods. We introduce MatViX, a benchmark consisting of 324 full-length research articles and 1,688 complex structured JSON files, carefully curated by domain experts in polymer nanocomposites and biodegradation. These JSON files are extracted from text, tables, and figures in full-length documents, providing a comprehensive challenge for MIE. We introduce a novel evaluation method to assess the accuracy of curve similarity and the alignment of hierarchical structures. Additionally, we benchmark vision-language models (VLMs) in a zero-shot manner, capable of processing long contexts and multimodal inputs. Our results demonstrate significant room for improvement in current models.</abstract>
<identifier type="citekey">khalighinejad-etal-2025-matvix</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.185</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.185/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>3636</start>
<end>3655</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MatViX: Multimodal Information Extraction from Visually Rich Articles
%A Khalighinejad, Ghazal
%A Scott, Sharon
%A Liu, Ollie
%A Anderson, Kelly L.
%A Stureborg, Rickard
%A Tyagi, Aman
%A Dhingra, Bhuwan
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F khalighinejad-etal-2025-matvix
%X Multimodal information extraction (MIE) is crucial for scientific literature, where valuable data is often spread across text, figures, and tables. In materials science, extracting structured information from research articles can accelerate the discovery of new materials. However, the multimodal nature and complex interconnections of scientific content present challenges for traditional text-based methods. We introduce MatViX, a benchmark consisting of 324 full-length research articles and 1,688 complex structured JSON files, carefully curated by domain experts in polymer nanocomposites and biodegradation. These JSON files are extracted from text, tables, and figures in full-length documents, providing a comprehensive challenge for MIE. We introduce a novel evaluation method to assess the accuracy of curve similarity and the alignment of hierarchical structures. Additionally, we benchmark vision-language models (VLMs) in a zero-shot manner, capable of processing long contexts and multimodal inputs. Our results demonstrate significant room for improvement in current models.
%R 10.18653/v1/2025.naacl-long.185
%U https://aclanthology.org/2025.naacl-long.185/
%U https://doi.org/10.18653/v1/2025.naacl-long.185
%P 3636-3655
Markdown (Informal)
[MatViX: Multimodal Information Extraction from Visually Rich Articles](https://aclanthology.org/2025.naacl-long.185/) (Khalighinejad et al., NAACL 2025)
ACL
- Ghazal Khalighinejad, Sharon Scott, Ollie Liu, Kelly L. Anderson, Rickard Stureborg, Aman Tyagi, and Bhuwan Dhingra. 2025. MatViX: Multimodal Information Extraction from Visually Rich Articles. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 3636–3655, Albuquerque, New Mexico. Association for Computational Linguistics.