@inproceedings{pai-etal-2023-exploration,
title = "Exploration of Open Large Language Models for e{D}iscovery",
author = "Pai, Sumit and
Lahiri, Sounak and
Kumar, Ujjwal and
Baksi, Krishanu and
Soba, Elijah and
Suesserman, Michael and
Pudota, Nirmala and
Foster, Jon and
Bowen, Edward and
Bhattacharya, Sanmitra",
editor = "Preo{\textcommabelow{t}}iuc-Pietro, Daniel and
Goanta, Catalina and
Chalkidis, Ilias and
Barrett, Leslie and
Spanakis, Gerasimos and
Aletras, Nikolaos",
booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.nllp-1.17",
doi = "10.18653/v1/2023.nllp-1.17",
pages = "166--177",
abstract = "The rapid advancement of Generative Artificial Intelligence (AI), particularly Large Language Models (LLMs), has led to their widespread adoption for various natural language processing (NLP) tasks. One crucial domain ripe for innovation is the Technology-Assisted Review (TAR) process in Electronic discovery (eDiscovery). Traditionally, TAR involves manual review and classification of documents for relevance over large document collections for litigations and investigations. This process is aided by machine learning and NLP tools which require extensive training and fine-tuning. In this paper, we explore the application of LLMs to TAR, specifically for predictive coding. We experiment with out-of-the-box prompting and fine-tuning of LLMs using parameter-efficient techniques. We conduct experiments using open LLMs and compare them to commercially-licensed ones. Our experiments demonstrate that open LLMs lag behind commercially-licensed models in relevance classification using out-of-the-box prompting. However, topic-specific instruction tuning of open LLMs not only improve their effectiveness but can often outperform their commercially-licensed counterparts in performance evaluations. Additionally, we conduct a user study to gauge the preferences of our eDiscovery Subject Matter Specialists (SMS) regarding human-authored versus model-generated reasoning. We demonstrate that instruction-tuned open LLMs can generate high quality reasonings that are comparable to commercial LLMs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="pai-etal-2023-exploration">
<titleInfo>
<title>Exploration of Open Large Language Models for eDiscovery</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sumit</namePart>
<namePart type="family">Pai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sounak</namePart>
<namePart type="family">Lahiri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ujjwal</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Krishanu</namePart>
<namePart type="family">Baksi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elijah</namePart>
<namePart type="family">Soba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Suesserman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nirmala</namePart>
<namePart type="family">Pudota</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jon</namePart>
<namePart type="family">Foster</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edward</namePart>
<namePart type="family">Bowen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanmitra</namePart>
<namePart type="family">Bhattacharya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Natural Legal Language Processing Workshop 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preo\textcommabelowtiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Catalina</namePart>
<namePart type="family">Goanta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilias</namePart>
<namePart type="family">Chalkidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leslie</namePart>
<namePart type="family">Barrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gerasimos</namePart>
<namePart type="family">Spanakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The rapid advancement of Generative Artificial Intelligence (AI), particularly Large Language Models (LLMs), has led to their widespread adoption for various natural language processing (NLP) tasks. One crucial domain ripe for innovation is the Technology-Assisted Review (TAR) process in Electronic discovery (eDiscovery). Traditionally, TAR involves manual review and classification of documents for relevance over large document collections for litigations and investigations. This process is aided by machine learning and NLP tools which require extensive training and fine-tuning. In this paper, we explore the application of LLMs to TAR, specifically for predictive coding. We experiment with out-of-the-box prompting and fine-tuning of LLMs using parameter-efficient techniques. We conduct experiments using open LLMs and compare them to commercially-licensed ones. Our experiments demonstrate that open LLMs lag behind commercially-licensed models in relevance classification using out-of-the-box prompting. However, topic-specific instruction tuning of open LLMs not only improve their effectiveness but can often outperform their commercially-licensed counterparts in performance evaluations. Additionally, we conduct a user study to gauge the preferences of our eDiscovery Subject Matter Specialists (SMS) regarding human-authored versus model-generated reasoning. We demonstrate that instruction-tuned open LLMs can generate high quality reasonings that are comparable to commercial LLMs.</abstract>
<identifier type="citekey">pai-etal-2023-exploration</identifier>
<identifier type="doi">10.18653/v1/2023.nllp-1.17</identifier>
<location>
<url>https://aclanthology.org/2023.nllp-1.17</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>166</start>
<end>177</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploration of Open Large Language Models for eDiscovery
%A Pai, Sumit
%A Lahiri, Sounak
%A Kumar, Ujjwal
%A Baksi, Krishanu
%A Soba, Elijah
%A Suesserman, Michael
%A Pudota, Nirmala
%A Foster, Jon
%A Bowen, Edward
%A Bhattacharya, Sanmitra
%Y Preo\textcommabelowtiuc-Pietro, Daniel
%Y Goanta, Catalina
%Y Chalkidis, Ilias
%Y Barrett, Leslie
%Y Spanakis, Gerasimos
%Y Aletras, Nikolaos
%S Proceedings of the Natural Legal Language Processing Workshop 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F pai-etal-2023-exploration
%X The rapid advancement of Generative Artificial Intelligence (AI), particularly Large Language Models (LLMs), has led to their widespread adoption for various natural language processing (NLP) tasks. One crucial domain ripe for innovation is the Technology-Assisted Review (TAR) process in Electronic discovery (eDiscovery). Traditionally, TAR involves manual review and classification of documents for relevance over large document collections for litigations and investigations. This process is aided by machine learning and NLP tools which require extensive training and fine-tuning. In this paper, we explore the application of LLMs to TAR, specifically for predictive coding. We experiment with out-of-the-box prompting and fine-tuning of LLMs using parameter-efficient techniques. We conduct experiments using open LLMs and compare them to commercially-licensed ones. Our experiments demonstrate that open LLMs lag behind commercially-licensed models in relevance classification using out-of-the-box prompting. However, topic-specific instruction tuning of open LLMs not only improve their effectiveness but can often outperform their commercially-licensed counterparts in performance evaluations. Additionally, we conduct a user study to gauge the preferences of our eDiscovery Subject Matter Specialists (SMS) regarding human-authored versus model-generated reasoning. We demonstrate that instruction-tuned open LLMs can generate high quality reasonings that are comparable to commercial LLMs.
%R 10.18653/v1/2023.nllp-1.17
%U https://aclanthology.org/2023.nllp-1.17
%U https://doi.org/10.18653/v1/2023.nllp-1.17
%P 166-177
Markdown (Informal)
[Exploration of Open Large Language Models for eDiscovery](https://aclanthology.org/2023.nllp-1.17) (Pai et al., NLLP-WS 2023)
ACL
- Sumit Pai, Sounak Lahiri, Ujjwal Kumar, Krishanu Baksi, Elijah Soba, Michael Suesserman, Nirmala Pudota, Jon Foster, Edward Bowen, and Sanmitra Bhattacharya. 2023. Exploration of Open Large Language Models for eDiscovery. In Proceedings of the Natural Legal Language Processing Workshop 2023, pages 166–177, Singapore. Association for Computational Linguistics.