% Conference paper record; ACL Anthology ID 2024.inlg-main.52.
@inproceedings{ramu-etal-2024-zooming-zero,
  title     = {Zooming in on Zero-Shot Intent-Guided and Grounded Document Generation using {LLM}s},
  author    = {Ramu, Pritika and
               Gaur, Pranshu and
               Emandi, Rishita and
               Maheshwari, Himanshu and
               Javed, Danish and
               Garimella, Aparna},
  editor    = {Mahamood, Saad and
               Minh, Nguyen Le and
               Ippolito, Daphne},
  booktitle = {Proceedings of the 17th International Natural Language Generation Conference},
  month     = sep,
  year      = {2024},
  address   = {Tokyo, Japan},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.inlg-main.52},
  pages     = {676--694},
  abstract  = {Repurposing existing content on-the-fly to suit author{'}s goals for creating initial drafts is crucial for document creation. We introduce the task of intent-guided and grounded document generation: given a user-specified intent (e.g., section title) and a few reference documents, the goal is to generate section-level multimodal documents spanning text and images, grounded on the given references, in a zero-shot setting. We present a data curation strategy to obtain general-domain samples from Wikipedia, and collect 1,000 Wikipedia sections consisting of textual and image content along with appropriate intent specifications and references. We propose a simple yet effective planning-based prompting strategy, Multimodal Plan-And-Write (MM-PAW), to prompt LLMs to generate an intermediate plan with text and image descriptions, to guide the subsequent generation. We compare the performances of MM-PAW and a text-only variant of it with those of zero-shot Chain-of-Thought (CoT) using recent close and open-domain LLMs. Both of them lead to significantly better performances in terms of content relevance, structure, and groundedness to the references, more so in the smaller models (upto 12.5 points increase in Rouge 1-F1) than in the larger ones (upto 4 points increase in R1-F1). They are particularly effective in improving relatively smaller models{'} performances, to be on par or higher than those of their larger counterparts for this task.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID ramu-etal-2024-zooming-zero. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ramu-etal-2024-zooming-zero">
    <titleInfo>
      <title>Zooming in on Zero-Shot Intent-Guided and Grounded Document Generation using LLMs</title>
    </titleInfo>
    <!-- Authors, in the order they are listed in the record. -->
    <name type="personal">
      <namePart type="given">Pritika</namePart>
      <namePart type="family">Ramu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pranshu</namePart>
      <namePart type="family">Gaur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rishita</namePart>
      <namePart type="family">Emandi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Himanshu</namePart>
      <namePart type="family">Maheshwari</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Danish</namePart>
      <namePart type="family">Javed</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aparna</namePart>
      <namePart type="family">Garimella</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th International Natural Language Generation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Saad</namePart>
        <namePart type="family">Mahamood</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nguyen</namePart>
        <namePart type="given">Le</namePart>
        <namePart type="family">Minh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daphne</namePart>
        <namePart type="family">Ippolito</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Tokyo, Japan</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Repurposing existing content on-the-fly to suit author’s goals for creating initial drafts is crucial for document creation. We introduce the task of intent-guided and grounded document generation: given a user-specified intent (e.g., section title) and a few reference documents, the goal is to generate section-level multimodal documents spanning text and images, grounded on the given references, in a zero-shot setting. We present a data curation strategy to obtain general-domain samples from Wikipedia, and collect 1,000 Wikipedia sections consisting of textual and image content along with appropriate intent specifications and references. We propose a simple yet effective planning-based prompting strategy, Multimodal Plan-And-Write (MM-PAW), to prompt LLMs to generate an intermediate plan with text and image descriptions, to guide the subsequent generation. We compare the performances of MM-PAW and a text-only variant of it with those of zero-shot Chain-of-Thought (CoT) using recent close and open-domain LLMs. Both of them lead to significantly better performances in terms of content relevance, structure, and groundedness to the references, more so in the smaller models (upto 12.5 points increase in Rouge 1-F1) than in the larger ones (upto 4 points increase in R1-F1). They are particularly effective in improving relatively smaller models’ performances, to be on par or higher than those of their larger counterparts for this task.</abstract>
    <identifier type="citekey">ramu-etal-2024-zooming-zero</identifier>
    <location>
      <url>https://aclanthology.org/2024.inlg-main.52</url>
    </location>
    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2024-09</date>
      <extent unit="page">
        <start>676</start>
        <end>694</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Zooming in on Zero-Shot Intent-Guided and Grounded Document Generation using LLMs
%A Ramu, Pritika
%A Gaur, Pranshu
%A Emandi, Rishita
%A Maheshwari, Himanshu
%A Javed, Danish
%A Garimella, Aparna
%Y Mahamood, Saad
%Y Minh, Nguyen Le
%Y Ippolito, Daphne
%S Proceedings of the 17th International Natural Language Generation Conference
%D 2024
%8 September
%I Association for Computational Linguistics
%C Tokyo, Japan
%F ramu-etal-2024-zooming-zero
%X Repurposing existing content on-the-fly to suit author’s goals for creating initial drafts is crucial for document creation. We introduce the task of intent-guided and grounded document generation: given a user-specified intent (e.g., section title) and a few reference documents, the goal is to generate section-level multimodal documents spanning text and images, grounded on the given references, in a zero-shot setting. We present a data curation strategy to obtain general-domain samples from Wikipedia, and collect 1,000 Wikipedia sections consisting of textual and image content along with appropriate intent specifications and references. We propose a simple yet effective planning-based prompting strategy, Multimodal Plan-And-Write (MM-PAW), to prompt LLMs to generate an intermediate plan with text and image descriptions, to guide the subsequent generation. We compare the performances of MM-PAW and a text-only variant of it with those of zero-shot Chain-of-Thought (CoT) using recent close and open-domain LLMs. Both of them lead to significantly better performances in terms of content relevance, structure, and groundedness to the references, more so in the smaller models (upto 12.5 points increase in Rouge 1-F1) than in the larger ones (upto 4 points increase in R1-F1). They are particularly effective in improving relatively smaller models’ performances, to be on par or higher than those of their larger counterparts for this task.
%U https://aclanthology.org/2024.inlg-main.52
%P 676-694
Markdown (Informal)
[Zooming in on Zero-Shot Intent-Guided and Grounded Document Generation using LLMs](https://aclanthology.org/2024.inlg-main.52) (Ramu et al., INLG 2024)
ACL