@inproceedings{hatty-etal-2024-cost,
title = "A Cost-Efficient Modular Sieve for Extracting Product Information from Company Websites",
author = {H{\"a}tty, Anna and
Milchevski, Dragan and
D{\"o}ring, Kersten and
Putnikovic, Marko and
Mesgar, Mohsen and
Novovi{\'c}, Filip and
Braun, Maximilian and
Borimann, Karina Leoni and
Stranjanac, Igor},
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.106",
pages = "1444--1456",
abstract = "Extracting product information is crucial for informed business decisions and strategic planning across multiple industries. However, recent methods relying only on large language models (LLMs) are resource-intensive and computationally prohibitive due to website structure differences and numerous non-product pages. To address these challenges, we propose a novel modular method that leverages low-cost classification models to filter out company web pages, significantly reducing computational costs. Our approach consists of three modules: web page crawling, product page classification using efficient machine learning models, and product information extraction using LLMs on classified product pages. We evaluate our method on a new dataset of about 7000 product and non-product web pages, achieving a 6-point improvement in F1-score, 95{\%} reduction in computational time, and 87.5{\%} reduction in cost compared to end-to-end LLMs. Our research demonstrates the effectiveness of our proposed low-cost classification module to identify web pages containing product information, making product information extraction more effective and cost-efficient.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hatty-etal-2024-cost">
<titleInfo>
<title>A Cost-Efficient Modular Sieve for Extracting Product Information from Company Websites</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Hätty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dragan</namePart>
<namePart type="family">Milchevski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kersten</namePart>
<namePart type="family">Döring</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marko</namePart>
<namePart type="family">Putnikovic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohsen</namePart>
<namePart type="family">Mesgar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Filip</namePart>
<namePart type="family">Novović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maximilian</namePart>
<namePart type="family">Braun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karina</namePart>
<namePart type="given">Leoni</namePart>
<namePart type="family">Borimann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Igor</namePart>
<namePart type="family">Stranjanac</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Extracting product information is crucial for informed business decisions and strategic planning across multiple industries. However, recent methods relying only on large language models (LLMs) are resource-intensive and computationally prohibitive due to website structure differences and numerous non-product pages. To address these challenges, we propose a novel modular method that leverages low-cost classification models to filter out company web pages, significantly reducing computational costs. Our approach consists of three modules: web page crawling, product page classification using efficient machine learning models, and product information extraction using LLMs on classified product pages. We evaluate our method on a new dataset of about 7000 product and non-product web pages, achieving a 6-point improvement in F1-score, 95% reduction in computational time, and 87.5% reduction in cost compared to end-to-end LLMs. Our research demonstrates the effectiveness of our proposed low-cost classification module to identify web pages containing product information, making product information extraction more effective and cost-efficient.</abstract>
<identifier type="citekey">hatty-etal-2024-cost</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.106</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1444</start>
<end>1456</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Cost-Efficient Modular Sieve for Extracting Product Information from Company Websites
%A Hätty, Anna
%A Milchevski, Dragan
%A Döring, Kersten
%A Putnikovic, Marko
%A Mesgar, Mohsen
%A Novović, Filip
%A Braun, Maximilian
%A Borimann, Karina Leoni
%A Stranjanac, Igor
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F hatty-etal-2024-cost
%X Extracting product information is crucial for informed business decisions and strategic planning across multiple industries. However, recent methods relying only on large language models (LLMs) are resource-intensive and computationally prohibitive due to website structure differences and numerous non-product pages. To address these challenges, we propose a novel modular method that leverages low-cost classification models to filter out company web pages, significantly reducing computational costs. Our approach consists of three modules: web page crawling, product page classification using efficient machine learning models, and product information extraction using LLMs on classified product pages. We evaluate our method on a new dataset of about 7000 product and non-product web pages, achieving a 6-point improvement in F1-score, 95% reduction in computational time, and 87.5% reduction in cost compared to end-to-end LLMs. Our research demonstrates the effectiveness of our proposed low-cost classification module to identify web pages containing product information, making product information extraction more effective and cost-efficient.
%U https://aclanthology.org/2024.emnlp-industry.106
%P 1444-1456
Markdown (Informal)
[A Cost-Efficient Modular Sieve for Extracting Product Information from Company Websites](https://aclanthology.org/2024.emnlp-industry.106) (Hätty et al., EMNLP 2024)
ACL
- Anna Hätty, Dragan Milchevski, Kersten Döring, Marko Putnikovic, Mohsen Mesgar, Filip Novović, Maximilian Braun, Karina Leoni Borimann, and Igor Stranjanac. 2024. A Cost-Efficient Modular Sieve for Extracting Product Information from Company Websites. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 1444–1456, Miami, Florida, US. Association for Computational Linguistics.