@article{liu-etal-2026-generating,
title = "Generating Visual Stories with Grounded and Coreferent Characters",
author = "Liu, Danyang and
Lapata, Mirella and
Keller, Frank",
journal = "Transactions of the Association for Computational Linguistics",
volume = "14",
year = "2026",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2026.tacl-1.21/",
doi = "10.1162/tacl.a.641",
pages = "442--464",
abstract = "Characters are important in narratives. They move the plot forward, create emotional connections, and embody the story{'}s themes. Visual storytelling methods focus more on the plot and events relating to it, without building the narrative around specific characters. As a result, the generated stories feel generic, with character mentions being absent, vague, or incorrect. To mitigate these issues, we introduce a new character-centric approach to visual story generation. We present the first model capable of predicting visual stories with consistently grounded and coreferent character mentions. Our model is finetuned on a new dataset which we build on top of the widely used VIST (Huang et al., 2016) benchmark. Specifically, we develop an automated pipeline to enrich VIST with visual and textual character coreference chains. We also propose new evaluation metrics to measure the richness of characters and coreference in stories. Experimental results show that our model generates stories with recurring characters which are consistent and coreferent to larger extent compared to baselines and state-of-the-art systems. Our code and dataset are available at https://github.com/iz2late/character-centric-vist."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2026-generating">
<titleInfo>
<title>Generating Visual Stories with Grounded and Coreferent Characters</title>
</titleInfo>
<name type="personal">
<namePart type="given">Danyang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mirella</namePart>
<namePart type="family">Lapata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frank</namePart>
<namePart type="family">Keller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Characters are important in narratives. They move the plot forward, create emotional connections, and embody the story’s themes. Visual storytelling methods focus more on the plot and events relating to it, without building the narrative around specific characters. As a result, the generated stories feel generic, with character mentions being absent, vague, or incorrect. To mitigate these issues, we introduce a new character-centric approach to visual story generation. We present the first model capable of predicting visual stories with consistently grounded and coreferent character mentions. Our model is finetuned on a new dataset which we build on top of the widely used VIST (Huang et al., 2016) benchmark. Specifically, we develop an automated pipeline to enrich VIST with visual and textual character coreference chains. We also propose new evaluation metrics to measure the richness of characters and coreference in stories. Experimental results show that our model generates stories with recurring characters which are consistent and coreferent to larger extent compared to baselines and state-of-the-art systems. Our code and dataset are available at https://github.com/iz2late/character-centric-vist.</abstract>
<identifier type="citekey">liu-etal-2026-generating</identifier>
<identifier type="doi">10.1162/tacl.a.641</identifier>
<location>
<url>https://aclanthology.org/2026.tacl-1.21/</url>
</location>
<part>
<date>2026</date>
<detail type="volume"><number>14</number></detail>
<extent unit="page">
<start>442</start>
<end>464</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Generating Visual Stories with Grounded and Coreferent Characters
%A Liu, Danyang
%A Lapata, Mirella
%A Keller, Frank
%J Transactions of the Association for Computational Linguistics
%D 2026
%V 14
%I MIT Press
%C Cambridge, MA
%F liu-etal-2026-generating
%X Characters are important in narratives. They move the plot forward, create emotional connections, and embody the story’s themes. Visual storytelling methods focus more on the plot and events relating to it, without building the narrative around specific characters. As a result, the generated stories feel generic, with character mentions being absent, vague, or incorrect. To mitigate these issues, we introduce a new character-centric approach to visual story generation. We present the first model capable of predicting visual stories with consistently grounded and coreferent character mentions. Our model is finetuned on a new dataset which we build on top of the widely used VIST (Huang et al., 2016) benchmark. Specifically, we develop an automated pipeline to enrich VIST with visual and textual character coreference chains. We also propose new evaluation metrics to measure the richness of characters and coreference in stories. Experimental results show that our model generates stories with recurring characters which are consistent and coreferent to larger extent compared to baselines and state-of-the-art systems. Our code and dataset are available at https://github.com/iz2late/character-centric-vist.
%R 10.1162/tacl.a.641
%U https://aclanthology.org/2026.tacl-1.21/
%U https://doi.org/10.1162/tacl.a.641
%P 442-464
Markdown (Informal)
[Generating Visual Stories with Grounded and Coreferent Characters](https://aclanthology.org/2026.tacl-1.21/) (Liu et al., TACL 2026)
ACL