@inproceedings{jobanputra-demberg-2024-teamsaarlst,
    title = "{T}eam{S}aar{LST} at the {GEM}{'}24 Data-to-text Task: Revisiting symbolic retrieval in the {LLM}-age",
    author = "Jobanputra, Mayank and
      Demberg, Vera",
    editor = "Mille, Simon and
      Clinciu, Miruna-Adriana",
    booktitle = "Proceedings of the 17th International Natural Language Generation Conference: Generation Challenges",
    month = sep,
    year = "2024",
    address = "Tokyo, Japan",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.inlg-genchal.10",
    pages = "92--99",
abstract = "Data-to-text (D2T) generation is a natural language generation (NLG) task in which a system describes structured data in natural language. Generating natural language verbalization for structured data is challenging as the data may not contain all the required details (here, properties such as gender are missing from the input data and need to be inferred for correct language generation), and because the structured data may conflict with the knowledge contained in the LLM{'}s parameters learned during pre-training. Both of these factors (incorrect filling in of details, pretraining conflict and input data) can lead to so-called hallucinations. In this paper, we propose a few-shot retrieval augmented generation (RAG) system, using a symbolic retriever {--} PropertyRetriever. Additionally, we experiment with state-of-the-art large language models (LLMs) to generate data verbalizations. Our system achieves the best results on 4 out of 6 subtasks for METEOR and chrF++ metrics. We present our results along with an error analysis. We release our code for reproducing the results as well as the generated verbalizations from all the experiments for any further explorations here.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jobanputra-demberg-2024-teamsaarlst">
<titleInfo>
<title>TeamSaarLST at the GEM’24 Data-to-text Task: Revisiting symbolic retrieval in the LLM-age</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mayank</namePart>
<namePart type="family">Jobanputra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Natural Language Generation Conference: Generation Challenges</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna-Adriana</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Data-to-text (D2T) generation is a natural language generation (NLG) task in which a system describes structured data in natural language. Generating natural language verbalization for structured data is challenging as the data may not contain all the required details (here, properties such as gender are missing from the input data and need to be inferred for correct language generation), and because the structured data may conflict with the knowledge contained in the LLM’s parameters learned during pre-training. Both of these factors (incorrect filling in of details, pretraining conflict and input data) can lead to so-called hallucinations. In this paper, we propose a few-shot retrieval augmented generation (RAG) system, using a symbolic retriever – PropertyRetriever. Additionally, we experiment with state-of-the-art large language models (LLMs) to generate data verbalizations. Our system achieves the best results on 4 out of 6 subtasks for METEOR and chrF++ metrics. We present our results along with an error analysis. We release our code for reproducing the results as well as the generated verbalizations from all the experiments for any further explorations here.</abstract>
<identifier type="citekey">jobanputra-demberg-2024-teamsaarlst</identifier>
<location>
<url>https://aclanthology.org/2024.inlg-genchal.10</url>
</location>
<part>
<date>2024-09</date>
<extent unit="page">
<start>92</start>
<end>99</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TeamSaarLST at the GEM’24 Data-to-text Task: Revisiting symbolic retrieval in the LLM-age
%A Jobanputra, Mayank
%A Demberg, Vera
%Y Mille, Simon
%Y Clinciu, Miruna-Adriana
%S Proceedings of the 17th International Natural Language Generation Conference: Generation Challenges
%D 2024
%8 September
%I Association for Computational Linguistics
%C Tokyo, Japan
%F jobanputra-demberg-2024-teamsaarlst
%X Data-to-text (D2T) generation is a natural language generation (NLG) task in which a system describes structured data in natural language. Generating natural language verbalizations for structured data is challenging because the data may not contain all the required details (here, properties such as gender are missing from the input data and need to be inferred for correct language generation), and because the structured data may conflict with the knowledge contained in the LLM’s parameters, learned during pre-training. Both of these factors (incorrectly filled-in details, and conflicts between pretraining knowledge and the input data) can lead to so-called hallucinations. In this paper, we propose a few-shot retrieval-augmented generation (RAG) system that uses a symbolic retriever, PropertyRetriever. Additionally, we experiment with state-of-the-art large language models (LLMs) to generate data verbalizations. Our system achieves the best results on 4 out of 6 subtasks on the METEOR and chrF++ metrics. We present our results along with an error analysis. We release our code for reproducing the results, as well as the generated verbalizations from all experiments, for further exploration here.
%U https://aclanthology.org/2024.inlg-genchal.10
%P 92-99
Markdown (Informal)
[TeamSaarLST at the GEM’24 Data-to-text Task: Revisiting symbolic retrieval in the LLM-age](https://aclanthology.org/2024.inlg-genchal.10) (Jobanputra & Demberg, INLG 2024)
ACL
Mayank Jobanputra and Vera Demberg. 2024. TeamSaarLST at the GEM’24 Data-to-text Task: Revisiting symbolic retrieval in the LLM-age. In Proceedings of the 17th International Natural Language Generation Conference: Generation Challenges, pages 92–99, Tokyo, Japan. Association for Computational Linguistics.