@article{hsu-etal-2026-large,
title = "Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from {S}ci{C}ap Challenge 2023",
author = "Hsu, Ting-Yao and
Hsu, Yi-Li and
Rohatgi, Shaurya and
Huang, Chieh-Yang and
Ng, Ho Yin Sam and
Rossi, Ryan and
Kim, Sungchul and
Yu, Tong and
Ku, Lun-Wei and
Giles, Clyde Lee and
Huang, Ting-Hao",
journal = "Transactions of the Association for Computational Linguistics",
volume = "14",
year = "2026",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2026.tacl-1.12/",
doi = "10.1162/tacl.a.653",
pages = "233--252",
abstract = "Since the SciCap dataset{'}s launch in 2021, the research community has made significant progress in generating captions for scientific figures in scholarly articles. In 2023, the first SciCap Challenge took place, inviting global teams to use an expanded SciCap dataset to develop models for captioning diverse figure types across various academic fields. At the same time, text generation models advanced quickly, with many powerful pre-trained large multimodal models (LMMs) emerging that showed impressive capabilities in various vision-and-language tasks. This paper presents an overview of the first SciCap Challenge and details the performance of various models on its data, capturing a snapshot of the field{'}s state. We found that professional editors overwhelmingly preferred figure captions generated by GPT-4V over those from all other models and even the original captions written by authors. Following this key finding, we conducted detailed analyses to answer this question: Have advanced LMMs solved the task of generating captions for scientific figures?"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hsu-etal-2026-large">
<titleInfo>
<title>Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from SciCap Challenge 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ting-Yao</namePart>
<namePart type="family">Hsu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi-Li</namePart>
<namePart type="family">Hsu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shaurya</namePart>
<namePart type="family">Rohatgi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chieh-Yang</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ho</namePart>
<namePart type="given">Yin</namePart>
<namePart type="given">Sam</namePart>
<namePart type="family">Ng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Rossi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungchul</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clyde</namePart>
<namePart type="given">Lee</namePart>
<namePart type="family">Giles</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ting-Hao</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Since the SciCap dataset’s launch in 2021, the research community has made significant progress in generating captions for scientific figures in scholarly articles. In 2023, the first SciCap Challenge took place, inviting global teams to use an expanded SciCap dataset to develop models for captioning diverse figure types across various academic fields. At the same time, text generation models advanced quickly, with many powerful pre-trained large multimodal models (LMMs) emerging that showed impressive capabilities in various vision-and-language tasks. This paper presents an overview of the first SciCap Challenge and details the performance of various models on its data, capturing a snapshot of the field’s state. We found that professional editors overwhelmingly preferred figure captions generated by GPT-4V over those from all other models and even the original captions written by authors. Following this key finding, we conducted detailed analyses to answer this question: Have advanced LMMs solved the task of generating captions for scientific figures?</abstract>
<identifier type="citekey">hsu-etal-2026-large</identifier>
<identifier type="doi">10.1162/tacl.a.653</identifier>
<location>
<url>https://aclanthology.org/2026.tacl-1.12/</url>
</location>
<part>
<date>2026</date>
<detail type="volume"><number>14</number></detail>
<extent unit="page">
<start>233</start>
<end>252</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from SciCap Challenge 2023
%A Hsu, Ting-Yao
%A Hsu, Yi-Li
%A Rohatgi, Shaurya
%A Huang, Chieh-Yang
%A Ng, Ho Yin Sam
%A Rossi, Ryan
%A Kim, Sungchul
%A Yu, Tong
%A Ku, Lun-Wei
%A Giles, Clyde Lee
%A Huang, Ting-Hao
%J Transactions of the Association for Computational Linguistics
%D 2026
%V 14
%I MIT Press
%C Cambridge, MA
%F hsu-etal-2026-large
%X Since the SciCap dataset’s launch in 2021, the research community has made significant progress in generating captions for scientific figures in scholarly articles. In 2023, the first SciCap Challenge took place, inviting global teams to use an expanded SciCap dataset to develop models for captioning diverse figure types across various academic fields. At the same time, text generation models advanced quickly, with many powerful pre-trained large multimodal models (LMMs) emerging that showed impressive capabilities in various vision-and-language tasks. This paper presents an overview of the first SciCap Challenge and details the performance of various models on its data, capturing a snapshot of the field’s state. We found that professional editors overwhelmingly preferred figure captions generated by GPT-4V over those from all other models and even the original captions written by authors. Following this key finding, we conducted detailed analyses to answer this question: Have advanced LMMs solved the task of generating captions for scientific figures?
%R 10.1162/tacl.a.653
%U https://aclanthology.org/2026.tacl-1.12/
%U https://doi.org/10.1162/tacl.a.653
%P 233-252
Markdown (Informal)
[Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from SciCap Challenge 2023](https://aclanthology.org/2026.tacl-1.12/) (Hsu et al., TACL 2026)
ACL
- Ting-Yao Hsu, Yi-Li Hsu, Shaurya Rohatgi, Chieh-Yang Huang, Ho Yin Sam Ng, Ryan Rossi, Sungchul Kim, Tong Yu, Lun-Wei Ku, Clyde Lee Giles, and Ting-Hao Huang. 2026. Do Large Multimodal Models Solve Caption Generation for Scientific Figures? Lessons Learned from SciCap Challenge 2023. Transactions of the Association for Computational Linguistics, 14:233–252.