% BibTeX record for the TripTide paper (Findings of ACL 2026).
% Case-sensitive words are brace-protected as whole words; the abstract is plain text.
@inproceedings{karmakar-etal-2026-triptide,
    title     = {{TripTide}: A Benchmark for Adaptive Travel Planning under Disruptions},
    author    = {Karmakar, Priyanshu and
                 Chaudhuri, Soumyabrata and
                 Mallick, Shubhojit and
                 Gupta, Manish and
                 Jana, Abhik and
                 Ghosh, Shreya},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.findings-acl.2002/},
    pages     = {40269--40292},
    isbn      = {979-8-89176-395-1},
    abstract  = {Recent work, such as TripCraft and TravelPlanner, has shown the promise of Large Language Models (LLMs) for personalized, constraint-aware travel itinerary generation. However, real-world travel often involves disruptions such as transit cancellations, weather-related closures, or overbooked attractions. To address this gap, we introduce TripTide, the first benchmark designed to evaluate the ability of LLMs to revise travel itineraries under realistic disruptions. TripTide models both disruption severity and traveler tolerance, enabling systematic evaluation of how LLMs respond to unexpected travel events. The benchmark simulates scenarios where existing itineraries must be revised while preserving the traveler{'}s original intent and respecting practical constraints. We conduct a three-fold evaluation of itinerary revision quality: (i) Automatic metrics measuring Preservation of Intent, Responsiveness, and Adaptability (semantic, spatial, and sequential), (ii) LLM-as-a-Judge evaluation assessing the quality and plausibility of revised itineraries and (iii) Human evaluation examining overall revision quality and user satisfaction. Our findings show that LLMs generally preserve semantic intent and sequential structure, while spatial deviations are more pronounced in shorter itineraries and diminish for longer ones. However, the ability to handle disruptions degrades as itinerary length increases, highlighting limitations in long-horizon itinerary revision. The TripTide benchmark provides a foundation for systematically evaluating robustness and adaptability in LLM-based travel planning systems.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper: six authors, host proceedings with four editors, abstract, URL and page extent. -->
  <mods ID="karmakar-etal-2026-triptide">
    <titleInfo>
      <title>TripTide: A Benchmark for Adaptive Travel Planning under Disruptions</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Priyanshu</namePart>
      <namePart type="family">Karmakar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Soumyabrata</namePart>
      <namePart type="family">Chaudhuri</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shubhojit</namePart>
      <namePart type="family">Mallick</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Manish</namePart>
      <namePart type="family">Gupta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Abhik</namePart>
      <namePart type="family">Jana</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shreya</namePart>
      <namePart type="family">Ghosh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Recent work, such as TripCraft and TravelPlanner, has shown the promise of Large Language Models (LLMs) for personalized, constraint-aware travel itinerary generation. However, real-world travel often involves disruptions such as transit cancellations, weather-related closures, or overbooked attractions. To address this gap, we introduce TripTide, the first benchmark designed to evaluate the ability of LLMs to revise travel itineraries under realistic disruptions. TripTide models both disruption severity and traveler tolerance, enabling systematic evaluation of how LLMs respond to unexpected travel events. The benchmark simulates scenarios where existing itineraries must be revised while preserving the traveler’s original intent and respecting practical constraints. We conduct a three-fold evaluation of itinerary revision quality: (i) Automatic metrics measuring Preservation of Intent, Responsiveness, and Adaptability (semantic, spatial, and sequential), (ii) LLM-as-a-Judge evaluation assessing the quality and plausibility of revised itineraries and (iii) Human evaluation examining overall revision quality and user satisfaction. Our findings show that LLMs generally preserve semantic intent and sequential structure, while spatial deviations are more pronounced in shorter itineraries and diminish for longer ones. However, the ability to handle disruptions degrades as itinerary length increases, highlighting limitations in long-horizon itinerary revision. The TripTide benchmark provides a foundation for systematically evaluating robustness and adaptability in LLM-based travel planning systems.</abstract>
    <identifier type="citekey">karmakar-etal-2026-triptide</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.2002/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>40269</start>
        <end>40292</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T TripTide: A Benchmark for Adaptive Travel Planning under Disruptions
%A Karmakar, Priyanshu
%A Chaudhuri, Soumyabrata
%A Mallick, Shubhojit
%A Gupta, Manish
%A Jana, Abhik
%A Ghosh, Shreya
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F karmakar-etal-2026-triptide
%X Recent work, such as TripCraft and TravelPlanner, has shown the promise of Large Language Models (LLMs) for personalized, constraint-aware travel itinerary generation. However, real-world travel often involves disruptions such as transit cancellations, weather-related closures, or overbooked attractions. To address this gap, we introduce TripTide, the first benchmark designed to evaluate the ability of LLMs to revise travel itineraries under realistic disruptions. TripTide models both disruption severity and traveler tolerance, enabling systematic evaluation of how LLMs respond to unexpected travel events. The benchmark simulates scenarios where existing itineraries must be revised while preserving the traveler’s original intent and respecting practical constraints. We conduct a three-fold evaluation of itinerary revision quality: (i) Automatic metrics measuring Preservation of Intent, Responsiveness, and Adaptability (semantic, spatial, and sequential), (ii) LLM-as-a-Judge evaluation assessing the quality and plausibility of revised itineraries and (iii) Human evaluation examining overall revision quality and user satisfaction. Our findings show that LLMs generally preserve semantic intent and sequential structure, while spatial deviations are more pronounced in shorter itineraries and diminish for longer ones. However, the ability to handle disruptions degrades as itinerary length increases, highlighting limitations in long-horizon itinerary revision. The TripTide benchmark provides a foundation for systematically evaluating robustness and adaptability in LLM-based travel planning systems.
%U https://aclanthology.org/2026.findings-acl.2002/
%P 40269-40292
Markdown (Informal)
[TripTide: A Benchmark for Adaptive Travel Planning under Disruptions](https://aclanthology.org/2026.findings-acl.2002/) (Karmakar et al., Findings 2026)
ACL
- Priyanshu Karmakar, Soumyabrata Chaudhuri, Shubhojit Mallick, Manish Gupta, Abhik Jana, and Shreya Ghosh. 2026. TripTide: A Benchmark for Adaptive Travel Planning under Disruptions. In Findings of the Association for Computational Linguistics: ACL 2026, pages 40269–40292, San Diego, California, United States. Association for Computational Linguistics.