@article{arora-etal-2023-reasoning,
title = "Reasoning over Public and Private Data in Retrieval-Based Systems",
author = "Arora, Simran and
Lewis, Patrick and
Fan, Angela and
Kahn, Jacob and
R{\'e}, Christopher",
journal = "Transactions of the Association for Computational Linguistics",
volume = "11",
year = "2023",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2023.tacl-1.51",
doi = "10.1162/tacl_a_00580",
pages = "902--921",
abstract = "Users an organizations are generating ever-increasing amounts of private data from a wide range of sources. Incorporating private context is important to personalize open-domain tasks such as question-answering, fact-checking, and personal assistants. State-of-the-art systems for these tasks explicitly retrieve information that is relevant to an input question from a background corpus before producing an answer. While today{'}s retrieval systems assume relevant corpora are fully (e.g., publicly) accessible, users are often unable or unwilling to expose their private data to entities hosting public data. We define the Split Iterative Retrieval (SPIRAL) problem involving iterative retrieval over multiple privacy scopes. We introduce a foundational benchmark with which to study SPIRAL, as no existing benchmark includes data from a private distribution. Our dataset, ConcurrentQA, includes data from distinct public and private distributions and is the first textual QA benchmark requiring concurrent retrieval over multiple distributions. Finally, we show that existing retrieval approaches face significant performance degradations when applied to our proposed retrieval setting and investigate approaches with which these tradeoffs can be mitigated. We release the new benchmark and code to reproduce the results.1",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="arora-etal-2023-reasoning">
<titleInfo>
<title>Reasoning over Public and Private Data in Retrieval-Based Systems</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simran</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrick</namePart>
<namePart type="family">Lewis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Angela</namePart>
<namePart type="family">Fan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jacob</namePart>
<namePart type="family">Kahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Ré</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>Users an organizations are generating ever-increasing amounts of private data from a wide range of sources. Incorporating private context is important to personalize open-domain tasks such as question-answering, fact-checking, and personal assistants. State-of-the-art systems for these tasks explicitly retrieve information that is relevant to an input question from a background corpus before producing an answer. While today’s retrieval systems assume relevant corpora are fully (e.g., publicly) accessible, users are often unable or unwilling to expose their private data to entities hosting public data. We define the Split Iterative Retrieval (SPIRAL) problem involving iterative retrieval over multiple privacy scopes. We introduce a foundational benchmark with which to study SPIRAL, as no existing benchmark includes data from a private distribution. Our dataset, ConcurrentQA, includes data from distinct public and private distributions and is the first textual QA benchmark requiring concurrent retrieval over multiple distributions. Finally, we show that existing retrieval approaches face significant performance degradations when applied to our proposed retrieval setting and investigate approaches with which these tradeoffs can be mitigated. We release the new benchmark and code to reproduce the results.1</abstract>
<identifier type="citekey">arora-etal-2023-reasoning</identifier>
<identifier type="doi">10.1162/tacl_a_00580</identifier>
<location>
<url>https://aclanthology.org/2023.tacl-1.51</url>
</location>
<part>
<date>2023</date>
<detail type="volume"><number>11</number></detail>
<extent unit="page">
<start>902</start>
<end>921</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Reasoning over Public and Private Data in Retrieval-Based Systems
%A Arora, Simran
%A Lewis, Patrick
%A Fan, Angela
%A Kahn, Jacob
%A Ré, Christopher
%J Transactions of the Association for Computational Linguistics
%D 2023
%V 11
%I MIT Press
%C Cambridge, MA
%F arora-etal-2023-reasoning
%X Users an organizations are generating ever-increasing amounts of private data from a wide range of sources. Incorporating private context is important to personalize open-domain tasks such as question-answering, fact-checking, and personal assistants. State-of-the-art systems for these tasks explicitly retrieve information that is relevant to an input question from a background corpus before producing an answer. While today’s retrieval systems assume relevant corpora are fully (e.g., publicly) accessible, users are often unable or unwilling to expose their private data to entities hosting public data. We define the Split Iterative Retrieval (SPIRAL) problem involving iterative retrieval over multiple privacy scopes. We introduce a foundational benchmark with which to study SPIRAL, as no existing benchmark includes data from a private distribution. Our dataset, ConcurrentQA, includes data from distinct public and private distributions and is the first textual QA benchmark requiring concurrent retrieval over multiple distributions. Finally, we show that existing retrieval approaches face significant performance degradations when applied to our proposed retrieval setting and investigate approaches with which these tradeoffs can be mitigated. We release the new benchmark and code to reproduce the results.1
%R 10.1162/tacl_a_00580
%U https://aclanthology.org/2023.tacl-1.51
%U https://doi.org/10.1162/tacl_a_00580
%P 902-921
Markdown (Informal)
[Reasoning over Public and Private Data in Retrieval-Based Systems](https://aclanthology.org/2023.tacl-1.51) (Arora et al., TACL 2023)
ACL