% Conference paper in the Findings of ACL 2026 (ACL Anthology ID 2026.findings-acl.1197).
% Whole words are brace-protected in title/booktitle so sentence-casing styles keep their capitals.
% The address field holds the location exactly as given in the exported record.
@inproceedings{pham-etal-2026-analystbench,
  title     = {{AnalystBench}: Benchmarking professional long-form report generation with web-mined multimodal tasks},
  author    = {Pham, Chau Minh and
               Wang, Zichao and
               Mathur, Puneet and
               Siu, Alexa and
               Jain, Akriti and
               Garimella, Aparna and
               Sai, Ananya B. and
               Lipka, Nedim and
               Iyyer, Mohit and
               Manjunatha, Varun},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1197/},
  pages     = {23894--23926},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large language models are increasingly used to draft long-form multimodal documents, but their end-to-end performance on professional report generation remains systematically understudied. We introduce AnalystBench, a continually extensible benchmark of 20 real-world report generation tasks grounded in multimodal document collections, where models must process millions of input tokens to produce long-form professional reports. Using expert-validated quality checklists and groundedness evaluation, we evaluate LLMs and coding agents and find that the best model, GPT-5.1, scores highly on executive summarization tasks (exceeding 90{\%} on quality checklists) but degrades substantially on tasks requiring long-horizon synthesis over large inputs (dropping to 25--40{\%}). Agent-based generation substantially benefits strong closed-source models such as GPT-5.1, with checklist scores improving by 20.24 percentage points and visual coverage by 37.41 points over vanilla generation, but offers little or negative gains for open-source models like DeepSeek-R1 (-3.02 points). Expert reviewers note that while generated reports are grounded and clearly separate factual description from interpretation, they often fall short in actionability, clarity, and quantitative precision, which highlights the gap between system performance and real-world professional needs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier further down. -->
  <mods ID="pham-etal-2026-analystbench">
    <titleInfo>
      <title>AnalystBench: Benchmarking professional long-form report generation with web-mined multimodal tasks</title>
    </titleInfo>

    <!-- Paper authors, one name element each. -->
    <name type="personal">
      <namePart type="given">Chau</namePart>
      <namePart type="given">Minh</namePart>
      <namePart type="family">Pham</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zichao</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Puneet</namePart>
      <namePart type="family">Mathur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexa</namePart>
      <namePart type="family">Siu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Akriti</namePart>
      <namePart type="family">Jain</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aparna</namePart>
      <namePart type="family">Garimella</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ananya</namePart>
      <namePart type="given">B</namePart>
      <namePart type="family">Sai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nedim</namePart>
      <namePart type="family">Lipka</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mohit</namePart>
      <namePart type="family">Iyyer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Varun</namePart>
      <namePart type="family">Manjunatha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <abstract>Large language models are increasingly used to draft long-form multimodal documents, but their end-to-end performance on professional report generation remains systematically understudied. We introduce AnalystBench, a continually extensible benchmark of 20 real-world report generation tasks grounded in multimodal document collections, where models must process millions of input tokens to produce long-form professional reports. Using expert-validated quality checklists and groundedness evaluation, we evaluate LLMs and coding agents and find that the best model, GPT-5.1, scores highly on executive summarization tasks (exceeding 90% on quality checklists) but degrades substantially on tasks requiring long-horizon synthesis over large inputs (dropping to 25-40%). Agent-based generation substantially benefits strong closed-source models such as GPT-5.1, with checklist scores improving by 20.24 percentage points and visual coverage by 37.41 points over vanilla generation, but offers little or negative gains for open-source models like DeepSeek-R1 (-3.02 points). Expert reviewers note that while generated reports are grounded and clearly separate factual description from interpretation, they often fall short in actionability, clarity, and quantitative precision, which highlights the gap between system performance and real-world professional needs.</abstract>
    <identifier type="citekey">pham-etal-2026-analystbench</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1197/</url>
    </location>

    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>23894</start>
        <end>23926</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T AnalystBench: Benchmarking professional long-form report generation with web-mined multimodal tasks
%A Pham, Chau Minh
%A Wang, Zichao
%A Mathur, Puneet
%A Siu, Alexa
%A Jain, Akriti
%A Garimella, Aparna
%A Sai, Ananya B.
%A Lipka, Nedim
%A Iyyer, Mohit
%A Manjunatha, Varun
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F pham-etal-2026-analystbench
%X Large language models are increasingly used to draft long-form multimodal documents, but their end-to-end performance on professional report generation remains systematically understudied. We introduce AnalystBench, a continually extensible benchmark of 20 real-world report generation tasks grounded in multimodal document collections, where models must process millions of input tokens to produce long-form professional reports. Using expert-validated quality checklists and groundedness evaluation, we evaluate LLMs and coding agents and find that the best model, GPT-5.1, scores highly on executive summarization tasks (exceeding 90% on quality checklists) but degrades substantially on tasks requiring long-horizon synthesis over large inputs (dropping to 25-40%). Agent-based generation substantially benefits strong closed-source models such as GPT-5.1, with checklist scores improving by 20.24 percentage points and visual coverage by 37.41 points over vanilla generation, but offers little or negative gains for open-source models like DeepSeek-R1 (-3.02 points). Expert reviewers note that while generated reports are grounded and clearly separate factual description from interpretation, they often fall short in actionability, clarity, and quantitative precision, which highlights the gap between system performance and real-world professional needs.
%U https://aclanthology.org/2026.findings-acl.1197/
%P 23894-23926
Markdown (Informal)
[AnalystBench: Benchmarking professional long-form report generation with web-mined multimodal tasks](https://aclanthology.org/2026.findings-acl.1197/) (Pham et al., Findings 2026)
ACL
- Chau Minh Pham, Zichao Wang, Puneet Mathur, Alexa Siu, Akriti Jain, Aparna Garimella, Ananya B. Sai, Nedim Lipka, Mohit Iyyer, and Varun Manjunatha. 2026. AnalystBench: Benchmarking professional long-form report generation with web-mined multimodal tasks. In Findings of the Association for Computational Linguistics: ACL 2026, pages 23894–23926, San Diego, California, United States. Association for Computational Linguistics.