@inproceedings{fellman-etal-2024-future,
title = "The Future of Web Data Mining: Insights from Multimodal and Code-based Extraction Methods",
author = "Fellman, Evan and
Tyo, Jacob and
Lipton, Zachary",
editor = {H{\"u}rriyeto{\u{g}}lu, Ali and
Tanev, Hristo and
Thapa, Surendrabikram and
Uludo{\u{g}}an, G{\"o}k{\c{c}}e},
booktitle = "Proceedings of the 7th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE 2024)",
month = mar,
year = "2024",
address = "St. Julians, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.case-1.1",
pages = "1--5",
abstract = "The extraction of structured data from websites is critical for numerous Artificial Intelligence applications, but modern web design increasingly stores information visually in images rather than in text. This shift calls into question the optimal technique, as language-only models fail without textual cues while new multimodal models like GPT-4 promise image understanding abilities. We conduct the first rigorous comparison between text-based and vision-based models for extracting event metadata harvested from comic convention websites. Surprisingly, our results between GPT-4 Vision and GPT-4 Text uncover a significant accuracy advantage for vision-based methods in an applies-to-apples setting, indicating that vision models may be outpacing language-alone techniques in the task of information extraction from websites. We release our dataset and provide a qualitative analysis to guide further research in multi-modal models for web information extraction.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fellman-etal-2024-future">
<titleInfo>
<title>The Future of Web Data Mining: Insights from Multimodal and Code-based Extraction Methods</title>
</titleInfo>
<name type="personal">
<namePart type="given">Evan</namePart>
<namePart type="family">Fellman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="family">Tyo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zachary</namePart>
<namePart type="family">Lipton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ali</namePart>
<namePart type="family">Hürriyetoğlu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hristo</namePart>
<namePart type="family">Tanev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surendrabikram</namePart>
<namePart type="family">Thapa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gökçe</namePart>
<namePart type="family">Uludoğan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">St. Julians, Malta</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The extraction of structured data from websites is critical for numerous Artificial Intelligence applications, but modern web design increasingly stores information visually in images rather than in text. This shift calls into question the optimal technique, as language-only models fail without textual cues while new multimodal models like GPT-4 promise image understanding abilities. We conduct the first rigorous comparison between text-based and vision-based models for extracting event metadata harvested from comic convention websites. Surprisingly, our results between GPT-4 Vision and GPT-4 Text uncover a significant accuracy advantage for vision-based methods in an applies-to-apples setting, indicating that vision models may be outpacing language-alone techniques in the task of information extraction from websites. We release our dataset and provide a qualitative analysis to guide further research in multi-modal models for web information extraction.</abstract>
<identifier type="citekey">fellman-etal-2024-future</identifier>
<location>
<url>https://aclanthology.org/2024.case-1.1</url>
</location>
<part>
<date>2024-03</date>
<extent unit="page">
<start>1</start>
<end>5</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The Future of Web Data Mining: Insights from Multimodal and Code-based Extraction Methods
%A Fellman, Evan
%A Tyo, Jacob
%A Lipton, Zachary
%Y Hürriyetoğlu, Ali
%Y Tanev, Hristo
%Y Thapa, Surendrabikram
%Y Uludoğan, Gökçe
%S Proceedings of the 7th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE 2024)
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julians, Malta
%F fellman-etal-2024-future
%X The extraction of structured data from websites is critical for numerous Artificial Intelligence applications, but modern web design increasingly stores information visually in images rather than in text. This shift calls into question the optimal technique, as language-only models fail without textual cues while new multimodal models like GPT-4 promise image understanding abilities. We conduct the first rigorous comparison between text-based and vision-based models for extracting event metadata harvested from comic convention websites. Surprisingly, our results between GPT-4 Vision and GPT-4 Text uncover a significant accuracy advantage for vision-based methods in an applies-to-apples setting, indicating that vision models may be outpacing language-alone techniques in the task of information extraction from websites. We release our dataset and provide a qualitative analysis to guide further research in multi-modal models for web information extraction.
%U https://aclanthology.org/2024.case-1.1
%P 1-5
Markdown (Informal)
[The Future of Web Data Mining: Insights from Multimodal and Code-based Extraction Methods](https://aclanthology.org/2024.case-1.1) (Fellman et al., CASE-WS 2024)
ACL