@inproceedings{banerjee-etal-2024-standardizing,
title = "Standardizing Genomic Reports: A Dataset, A Standardized Format, and A Prompt-Based Technique for Structured Data Extraction",
author = "Banerjee, Tamali and
Varmora, Akshit and
Gorakhiya, Jay J. and
Sasidharan, Sanand and
Kanamarlapudi, Anuradha and
Bhattacharyya, Pushpak",
editor = "Lalitha Devi, Sobha and
Arora, Karunesh",
booktitle = "Proceedings of the 21st International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2024",
address = "AU-KBC Research Centre, Chennai, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2024.icon-1.5/",
pages = "45--53",
abstract = "Extracting information from genomic reports of cancer patients is crucial for both healthcare professionals and cancer research. While Large Language Models (LLMs) have shown promise in extracting information, their potential for handling genomic reports remains unexplored. These reports are complex, multi-page documents that feature a variety of visually rich, structured layouts and contain many domain-specific terms. Two primary challenges complicate the process: (i) extracting data from PDFs with intricate layouts and domain-specific terminology and (ii) dealing with variations in report layouts from different laboratories, making extraction layout-dependent and posing challenges for subsequent data processing. To tackle these issues, we propose GR-PROMPT, a prompt-based technique, and GR-FORMAT, a standardized format. Together, these two convert a genomic report in PDF format into GR-FORMAT as a JSON file using a multimodal LLM. To address the lack of available datasets for this task, we introduce GR-DATASET, a synthetic collection of 100 cancer genomic reports in PDF format. Each report is accompanied by key-value information presented in a layout-specific format, as well as structured key-value information in GR-FORMAT. This is the first dataset in this domain to promote further research for the task. We performed our experiment on this dataset."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="banerjee-etal-2024-standardizing">
<titleInfo>
<title>Standardizing Genomic Reports: A Dataset, A Standardized Format, and A Prompt-Based Technique for Structured Data Extraction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tamali</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akshit</namePart>
<namePart type="family">Varmora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jay J.</namePart>
<namePart type="family">Gorakhiya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sanand</namePart>
<namePart type="family">Sasidharan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anuradha</namePart>
<namePart type="family">Kanamarlapudi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 21st International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="family">Lalitha Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Karunesh</namePart>
<namePart type="family">Arora</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">AU-KBC Research Centre, Chennai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Extracting information from genomic reports of cancer patients is crucial for both healthcare professionals and cancer research. While Large Language Models (LLMs) have shown promise in extracting information, their potential for handling genomic reports remains unexplored. These reports are complex, multi-page documents that feature a variety of visually rich, structured layouts and contain many domain-specific terms. Two primary challenges complicate the process: (i) extracting data from PDFs with intricate layouts and domain-specific terminology and (ii) dealing with variations in report layouts from different laboratories, making extraction layout-dependent and posing challenges for subsequent data processing. To tackle these issues, we propose GR-PROMPT, a prompt-based technique, and GR-FORMAT, a standardized format. Together, these two convert a genomic report in PDF format into GR-FORMAT as a JSON file using a multimodal LLM. To address the lack of available datasets for this task, we introduce GR-DATASET, a synthetic collection of 100 cancer genomic reports in PDF format. Each report is accompanied by key-value information presented in a layout-specific format, as well as structured key-value information in GR-FORMAT. This is the first dataset in this domain to promote further research for the task. We performed our experiment on this dataset.</abstract>
<identifier type="citekey">banerjee-etal-2024-standardizing</identifier>
<location>
<url>https://aclanthology.org/2024.icon-1.5/</url>
</location>
<part>
<date>2024-12</date>
<extent unit="page">
<start>45</start>
<end>53</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Standardizing Genomic Reports: A Dataset, A Standardized Format, and A Prompt-Based Technique for Structured Data Extraction
%A Banerjee, Tamali
%A Varmora, Akshit
%A Gorakhiya, Jay J.
%A Sasidharan, Sanand
%A Kanamarlapudi, Anuradha
%A Bhattacharyya, Pushpak
%Y Lalitha Devi, Sobha
%Y Arora, Karunesh
%S Proceedings of the 21st International Conference on Natural Language Processing (ICON)
%D 2024
%8 December
%I NLP Association of India (NLPAI)
%C AU-KBC Research Centre, Chennai, India
%F banerjee-etal-2024-standardizing
%X Extracting information from genomic reports of cancer patients is crucial for both healthcare professionals and cancer research. While Large Language Models (LLMs) have shown promise in extracting information, their potential for handling genomic reports remains unexplored. These reports are complex, multi-page documents that feature a variety of visually rich, structured layouts and contain many domain-specific terms. Two primary challenges complicate the process: (i) extracting data from PDFs with intricate layouts and domain-specific terminology and (ii) dealing with variations in report layouts from different laboratories, making extraction layout-dependent and posing challenges for subsequent data processing. To tackle these issues, we propose GR-PROMPT, a prompt-based technique, and GR-FORMAT, a standardized format. Together, these two convert a genomic report in PDF format into GR-FORMAT as a JSON file using a multimodal LLM. To address the lack of available datasets for this task, we introduce GR-DATASET, a synthetic collection of 100 cancer genomic reports in PDF format. Each report is accompanied by key-value information presented in a layout-specific format, as well as structured key-value information in GR-FORMAT. This is the first dataset in this domain to promote further research for the task. We performed our experiment on this dataset.
%U https://aclanthology.org/2024.icon-1.5/
%P 45-53
Markdown (Informal)
[Standardizing Genomic Reports: A Dataset, A Standardized Format, and A Prompt-Based Technique for Structured Data Extraction](https://aclanthology.org/2024.icon-1.5/) (Banerjee et al., ICON 2024)
ACL
- Tamali Banerjee, Akshit Varmora, Jay J. Gorakhiya, Sanand Sasidharan, Anuradha Kanamarlapudi, and Pushpak Bhattacharyya. 2024. Standardizing Genomic Reports: A Dataset, A Standardized Format, and A Prompt-Based Technique for Structured Data Extraction. In Proceedings of the 21st International Conference on Natural Language Processing (ICON), pages 45–53, AU-KBC Research Centre, Chennai, India. NLP Association of India (NLPAI).