@inproceedings{li-etal-2024-summarization,
title = "Summarization-Based Document {ID}s for Generative Retrieval with Language Models",
author = "Li, Alan and
Cheng, Daniel and
Keung, Phillip and
Kasai, Jungo and
Smith, Noah A.",
editor = "Lucie-Aim{\'e}e, Lucie and
Fan, Angela and
Gwadabe, Tajuddeen and
Johnson, Isaac and
Petroni, Fabio and
van Strien, Daniel",
booktitle = "Proceedings of the First Workshop on Advancing Natural Language Processing for Wikipedia",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wikinlp-1.18",
doi = "10.18653/v1/2024.wikinlp-1.18",
pages = "126--135",
abstract = "Generative retrieval (Wang et al., 2022; Tay et al., 2022) is a popular approach for end-to-end document retrieval that directly generates document identifiers given an input query. We introduce summarization-based document IDs, in which each document{'}s ID is composed of an extractive summary or abstractive keyphrases generated by a language model, rather than an integer ID sequence or bags of n-grams as proposed in past work. We find that abstractive, content-based IDs (ACID) and an ID based on the first 30 tokens are very effective in direct comparisons with previous approaches to ID creation. We show that using ACID improves top-10 and top-20 recall by 15.6{\%} and 14.4{\%} (relative) respectively versus the cluster-based integer ID baseline on the MSMARCO 100k retrieval task, and 9.8{\%} and 9.9{\%} respectively on the Wikipedia-based NQ 100k retrieval task. Our results demonstrate the effectiveness of human-readable, natural-language IDs created through summarization for generative retrieval. We also observed that extractive IDs outperformed abstractive IDs on Wikipedia articles in NQ but not the snippets in MSMARCO, which suggests that document characteristics affect generative retrieval performance.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2024-summarization">
<titleInfo>
<title>Summarization-Based Document IDs for Generative Retrieval with Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Phillip</namePart>
<namePart type="family">Keung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jungo</namePart>
<namePart type="family">Kasai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Noah</namePart>
<namePart type="given">A</namePart>
<namePart type="family">Smith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Advancing Natural Language Processing for Wikipedia</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lucie</namePart>
<namePart type="family">Lucie-Aimée</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angela</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tajuddeen</namePart>
<namePart type="family">Gwadabe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Isaac</namePart>
<namePart type="family">Johnson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Petroni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">van Strien</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Generative retrieval (Wang et al., 2022; Tay et al., 2022) is a popular approach for end-to-end document retrieval that directly generates document identifiers given an input query. We introduce summarization-based document IDs, in which each document’s ID is composed of an extractive summary or abstractive keyphrases generated by a language model, rather than an integer ID sequence or bags of n-grams as proposed in past work. We find that abstractive, content-based IDs (ACID) and an ID based on the first 30 tokens are very effective in direct comparisons with previous approaches to ID creation. We show that using ACID improves top-10 and top-20 recall by 15.6% and 14.4% (relative) respectively versus the cluster-based integer ID baseline on the MSMARCO 100k retrieval task, and 9.8% and 9.9% respectively on the Wikipedia-based NQ 100k retrieval task. Our results demonstrate the effectiveness of human-readable, natural-language IDs created through summarization for generative retrieval. We also observed that extractive IDs outperformed abstractive IDs on Wikipedia articles in NQ but not the snippets in MSMARCO, which suggests that document characteristics affect generative retrieval performance.</abstract>
<identifier type="citekey">li-etal-2024-summarization</identifier>
<identifier type="doi">10.18653/v1/2024.wikinlp-1.18</identifier>
<location>
<url>https://aclanthology.org/2024.wikinlp-1.18</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>126</start>
<end>135</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Summarization-Based Document IDs for Generative Retrieval with Language Models
%A Li, Alan
%A Cheng, Daniel
%A Keung, Phillip
%A Kasai, Jungo
%A Smith, Noah A.
%Y Lucie-Aimée, Lucie
%Y Fan, Angela
%Y Gwadabe, Tajuddeen
%Y Johnson, Isaac
%Y Petroni, Fabio
%Y van Strien, Daniel
%S Proceedings of the First Workshop on Advancing Natural Language Processing for Wikipedia
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F li-etal-2024-summarization
%X Generative retrieval (Wang et al., 2022; Tay et al., 2022) is a popular approach for end-to-end document retrieval that directly generates document identifiers given an input query. We introduce summarization-based document IDs, in which each document’s ID is composed of an extractive summary or abstractive keyphrases generated by a language model, rather than an integer ID sequence or bags of n-grams as proposed in past work. We find that abstractive, content-based IDs (ACID) and an ID based on the first 30 tokens are very effective in direct comparisons with previous approaches to ID creation. We show that using ACID improves top-10 and top-20 recall by 15.6% and 14.4% (relative) respectively versus the cluster-based integer ID baseline on the MSMARCO 100k retrieval task, and 9.8% and 9.9% respectively on the Wikipedia-based NQ 100k retrieval task. Our results demonstrate the effectiveness of human-readable, natural-language IDs created through summarization for generative retrieval. We also observed that extractive IDs outperformed abstractive IDs on Wikipedia articles in NQ but not the snippets in MSMARCO, which suggests that document characteristics affect generative retrieval performance.
%R 10.18653/v1/2024.wikinlp-1.18
%U https://aclanthology.org/2024.wikinlp-1.18
%U https://doi.org/10.18653/v1/2024.wikinlp-1.18
%P 126-135
Markdown (Informal)
[Summarization-Based Document IDs for Generative Retrieval with Language Models](https://aclanthology.org/2024.wikinlp-1.18) (Li et al., WikiNLP 2024)
ACL