% ACL Anthology record 2025.acl-long.697 (ACL 2025, Volume 1: Long Papers).
@inproceedings{luo-etal-2025-browsing,
  title     = {Browsing Like Human: A Multimodal Web Agent with Experiential Fast-and-Slow Thinking},
  author    = {Luo, Haohao and
               Kuang, Jiayi and
               Liu, Wei and
               Shen, Ying and
               Luan, Jian and
               Deng, Yang},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.697/},
  doi       = {10.18653/v1/2025.acl-long.697},
  pages     = {14232--14251},
  isbn      = {979-8-89176-251-0},
  abstract  = {Automating web navigation which aims to build a web agent that follows user instructions to complete tasks like booking flights by interacting with websites, has received increasing attention due to its practical value. Although existing web agents are mostly equipped with visual perception, planning, and memory abilities, their reasoning process are still deviate from human cognition. In this work, we study the human thought pattern to empower agent with more human-like abilities in web navigation. To tackle this problem, we propose a novel multimodal web agent framework called WebExperT, which is designed to emulate the human planning process of ``thinking fast and slow'' to effectively decompose complex user instructions. Furthermore, WebExperT leverages experiential learning by reflecting from failure for continuously refining planning and decision-making outcomes. Experimental results on the Mind2Web benchmark demonstrate the superiority of WebExperT in both supervised and unsupervised settings.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; its ID matches the citekey identifier below. -->
  <mods ID="luo-etal-2025-browsing">
    <titleInfo>
      <title>Browsing Like Human: A Multimodal Web Agent with Experiential Fast-and-Slow Thinking</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Haohao</namePart>
      <namePart type="family">Luo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jiayi</namePart>
      <namePart type="family">Kuang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wei</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ying</namePart>
      <namePart type="family">Shen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jian</namePart>
      <namePart type="family">Luan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yang</namePart>
      <namePart type="family">Deng</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Wanxiang</namePart>
        <namePart type="family">Che</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joyce</namePart>
        <namePart type="family">Nabende</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ekaterina</namePart>
        <namePart type="family">Shutova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohammad</namePart>
        <namePart type="given">Taher</namePart>
        <namePart type="family">Pilehvar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-251-0</identifier>
    </relatedItem>
    <abstract>Automating web navigation which aims to build a web agent that follows user instructions to complete tasks like booking flights by interacting with websites, has received increasing attention due to its practical value. Although existing web agents are mostly equipped with visual perception, planning, and memory abilities, their reasoning process are still deviate from human cognition. In this work, we study the human thought pattern to empower agent with more human-like abilities in web navigation. To tackle this problem, we propose a novel multimodal web agent framework called WebExperT, which is designed to emulate the human planning process of “thinking fast and slow” to effectively decompose complex user instructions. Furthermore, WebExperT leverages experiential learning by reflecting from failure for continuously refining planning and decision-making outcomes. Experimental results on the Mind2Web benchmark demonstrate the superiority of WebExperT in both supervised and unsupervised settings.</abstract>
    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">luo-etal-2025-browsing</identifier>
    <identifier type="doi">10.18653/v1/2025.acl-long.697</identifier>
    <location>
      <url>https://aclanthology.org/2025.acl-long.697/</url>
    </location>
    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>14232</start>
        <end>14251</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Browsing Like Human: A Multimodal Web Agent with Experiential Fast-and-Slow Thinking
%A Luo, Haohao
%A Kuang, Jiayi
%A Liu, Wei
%A Shen, Ying
%A Luan, Jian
%A Deng, Yang
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F luo-etal-2025-browsing
%X Automating web navigation which aims to build a web agent that follows user instructions to complete tasks like booking flights by interacting with websites, has received increasing attention due to its practical value. Although existing web agents are mostly equipped with visual perception, planning, and memory abilities, their reasoning process are still deviate from human cognition. In this work, we study the human thought pattern to empower agent with more human-like abilities in web navigation. To tackle this problem, we propose a novel multimodal web agent framework called WebExperT, which is designed to emulate the human planning process of “thinking fast and slow” to effectively decompose complex user instructions. Furthermore, WebExperT leverages experiential learning by reflecting from failure for continuously refining planning and decision-making outcomes. Experimental results on the Mind2Web benchmark demonstrate the superiority of WebExperT in both supervised and unsupervised settings.
%R 10.18653/v1/2025.acl-long.697
%U https://aclanthology.org/2025.acl-long.697/
%U https://doi.org/10.18653/v1/2025.acl-long.697
%P 14232-14251
Markdown (Informal)
[Browsing Like Human: A Multimodal Web Agent with Experiential Fast-and-Slow Thinking](https://aclanthology.org/2025.acl-long.697/) (Luo et al., ACL 2025)
ACL