@inproceedings{tu-etal-2026-fico,
title = "Fico: Evaluating Vision-Language Models under Visual Fidelity and Compression at Scale",
author = "Tu, Jianhong and
Crispino, Nicholas and
Montgomery, Kyle and
Wang, Chenguang and
Song, Dawn",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1758/",
pages = "35261--35277",
ISBN = "979-8-89176-395-1",
abstract = "Visual text compression is an emerging paradigm for rendering text as images for processing by vision-language models (VLMs), enabling higher information density per context token. However, the robustness of VLMs under dense, text-based visual inputs remains unevaluated. We introduce Fico, a benchmark designed to assess VLM robustness across seven controlled variants of visual fidelity and information density. Fico spans documents of 8k to 64k tokens and includes three tasks of increasing semantic granularity: optical character recognition (OCR), needle-in-a-haystack (NIAH) retrieval, and visual question answering (VQA). Evaluating 13 general-purpose VLMs and 3 OCR-specialized models reveals three consistent trends: performance drops sharply under increased density or reduced resolution; cross-task transfer between OCR, NIAH, and VQA is limited; and VQA is comparatively robust because low-level details are lost before high-level semantics. By exposing failure modes that remain invisible under conventional VLM evaluations, Fico establishes a rigorous test-bed for visual text compression."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tu-etal-2026-fico">
<titleInfo>
<title>Fico: Evaluating Vision-Language Models under Visual Fidelity and Compression at Scale</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jianhong</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nicholas</namePart>
<namePart type="family">Crispino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Montgomery</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenguang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dawn</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Visual text compression is an emerging paradigm for rendering text as images for processing by vision-language models (VLMs), enabling higher information density per context token. However, the robustness of VLMs under dense, text-based visual inputs remains unevaluated. We introduce Fico, a benchmark designed to assess VLM robustness across seven controlled variants of visual fidelity and information density. Fico spans documents of 8k to 64k tokens and includes three tasks of increasing semantic granularity: optical character recognition (OCR), needle-in-a-haystack (NIAH) retrieval, and visual question answering (VQA). Evaluating 13 general-purpose VLMs and 3 OCR-specialized models reveals three consistent trends: performance drops sharply under increased density or reduced resolution; cross-task transfer between OCR, NIAH, and VQA is limited; and VQA is comparatively robust because low-level details are lost before high-level semantics. By exposing failure modes that remain invisible under conventional VLM evaluations, Fico establishes a rigorous test-bed for visual text compression.</abstract>
<identifier type="citekey">tu-etal-2026-fico</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1758/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>35261</start>
<end>35277</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Fico: Evaluating Vision-Language Models under Visual Fidelity and Compression at Scale
%A Tu, Jianhong
%A Crispino, Nicholas
%A Montgomery, Kyle
%A Wang, Chenguang
%A Song, Dawn
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F tu-etal-2026-fico
%X Visual text compression is an emerging paradigm for rendering text as images for processing by vision-language models (VLMs), enabling higher information density per context token. However, the robustness of VLMs under dense, text-based visual inputs remains unevaluated. We introduce Fico, a benchmark designed to assess VLM robustness across seven controlled variants of visual fidelity and information density. Fico spans documents of 8k to 64k tokens and includes three tasks of increasing semantic granularity: optical character recognition (OCR), needle-in-a-haystack (NIAH) retrieval, and visual question answering (VQA). Evaluating 13 general-purpose VLMs and 3 OCR-specialized models reveals three consistent trends: performance drops sharply under increased density or reduced resolution; cross-task transfer between OCR, NIAH, and VQA is limited; and VQA is comparatively robust because low-level details are lost before high-level semantics. By exposing failure modes that remain invisible under conventional VLM evaluations, Fico establishes a rigorous test-bed for visual text compression.
%U https://aclanthology.org/2026.findings-acl.1758/
%P 35261-35277
Markdown (Informal)
[Fico: Evaluating Vision-Language Models under Visual Fidelity and Compression at Scale](https://aclanthology.org/2026.findings-acl.1758/) (Tu et al., Findings 2026)
ACL