@inproceedings{addlesee-etal-2023-multi,
title = "Multi-party Goal Tracking with {LLM}s: Comparing Pre-training, Fine-tuning, and Prompt Engineering",
author = "Addlesee, Angus and
Siei{\'n}ska, Weronika and
Gunson, Nancie and
Hernandez Garcia, Daniel and
Dondrup, Christian and
Lemon, Oliver",
editor = "Stoyanchev, Svetlana and
Joty, Shafiq and
Schlangen, David and
Dusek, Ondrej and
Kennington, Casey and
Alikhani, Malihe",
booktitle = "Proceedings of the 24th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = sep,
year = "2023",
address = "Prague, Czechia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.sigdial-1.22",
doi = "10.18653/v1/2023.sigdial-1.22",
pages = "229--241",
abstract = "This paper evaluates the extent to which current LLMs can capture task-oriented multi-party conversations (MPCs). We have recorded and transcribed 29 MPCs between patients, their companions, and a social robot in a hospital. We then annotated this corpus for multi-party goal-tracking and intent-slot recognition. People share goals, answer each other{'}s goals, and provide other people{'}s goals in MPCs - none of which occur in dyadic interactions. To understand user goals in MPCs, we compared three methods in zero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks to train DialogLM using LED, and employed prompt engineering techniques with GPT-3.5-turbo, to determine which approach can complete this novel task with limited data. GPT-3.5-turbo significantly outperformed the others in a few-shot setting. The {`}reasoning{'} style prompt, when given 7{\%} of the corpus as example annotated conversations, was the best performing method. It correctly annotated 62.32{\%} of the goal tracking MPCs, and 69.57{\%} of the intent-slot recognition MPCs. A {`}story{'} style prompt increased model hallucination, which could be detrimental if deployed in safety-critical settings. We conclude that multi-party conversations still challenge state-of-the-art LLMs.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="addlesee-etal-2023-multi">
<titleInfo>
<title>Multi-party Goal Tracking with LLMs: Comparing Pre-training, Fine-tuning, and Prompt Engineering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Angus</namePart>
<namePart type="family">Addlesee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weronika</namePart>
<namePart type="family">Sieińska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nancie</namePart>
<namePart type="family">Gunson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Hernandez Garcia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Dondrup</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oliver</namePart>
<namePart type="family">Lemon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 24th Annual Meeting of the Special Interest Group on Discourse and Dialogue</title>
</titleInfo>
<name type="personal">
<namePart type="given">Svetlana</namePart>
<namePart type="family">Stoyanchev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shafiq</namePart>
<namePart type="family">Joty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Schlangen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondrej</namePart>
<namePart type="family">Dusek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Casey</namePart>
<namePart type="family">Kennington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Malihe</namePart>
<namePart type="family">Alikhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Prague, Czechia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper evaluates the extent to which current LLMs can capture task-oriented multi-party conversations (MPCs). We have recorded and transcribed 29 MPCs between patients, their companions, and a social robot in a hospital. We then annotated this corpus for multi-party goal-tracking and intent-slot recognition. People share goals, answer each other’s goals, and provide other people’s goals in MPCs - none of which occur in dyadic interactions. To understand user goals in MPCs, we compared three methods in zero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks to train DialogLM using LED, and employed prompt engineering techniques with GPT-3.5-turbo, to determine which approach can complete this novel task with limited data. GPT-3.5-turbo significantly outperformed the others in a few-shot setting. The ‘reasoning’ style prompt, when given 7% of the corpus as example annotated conversations, was the best performing method. It correctly annotated 62.32% of the goal tracking MPCs, and 69.57% of the intent-slot recognition MPCs. A ‘story’ style prompt increased model hallucination, which could be detrimental if deployed in safety-critical settings. We conclude that multi-party conversations still challenge state-of-the-art LLMs.</abstract>
<identifier type="citekey">addlesee-etal-2023-multi</identifier>
<identifier type="doi">10.18653/v1/2023.sigdial-1.22</identifier>
<location>
<url>https://aclanthology.org/2023.sigdial-1.22</url>
</location>
<part>
<date>2023-09</date>
<extent unit="page">
<start>229</start>
<end>241</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-party Goal Tracking with LLMs: Comparing Pre-training, Fine-tuning, and Prompt Engineering
%A Addlesee, Angus
%A Sieińska, Weronika
%A Gunson, Nancie
%A Hernandez Garcia, Daniel
%A Dondrup, Christian
%A Lemon, Oliver
%Y Stoyanchev, Svetlana
%Y Joty, Shafiq
%Y Schlangen, David
%Y Dusek, Ondrej
%Y Kennington, Casey
%Y Alikhani, Malihe
%S Proceedings of the 24th Annual Meeting of the Special Interest Group on Discourse and Dialogue
%D 2023
%8 September
%I Association for Computational Linguistics
%C Prague, Czechia
%F addlesee-etal-2023-multi
%X This paper evaluates the extent to which current LLMs can capture task-oriented multi-party conversations (MPCs). We have recorded and transcribed 29 MPCs between patients, their companions, and a social robot in a hospital. We then annotated this corpus for multi-party goal-tracking and intent-slot recognition. People share goals, answer each other’s goals, and provide other people’s goals in MPCs - none of which occur in dyadic interactions. To understand user goals in MPCs, we compared three methods in zero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks to train DialogLM using LED, and employed prompt engineering techniques with GPT-3.5-turbo, to determine which approach can complete this novel task with limited data. GPT-3.5-turbo significantly outperformed the others in a few-shot setting. The ‘reasoning’ style prompt, when given 7% of the corpus as example annotated conversations, was the best performing method. It correctly annotated 62.32% of the goal tracking MPCs, and 69.57% of the intent-slot recognition MPCs. A ‘story’ style prompt increased model hallucination, which could be detrimental if deployed in safety-critical settings. We conclude that multi-party conversations still challenge state-of-the-art LLMs.
%R 10.18653/v1/2023.sigdial-1.22
%U https://aclanthology.org/2023.sigdial-1.22
%U https://doi.org/10.18653/v1/2023.sigdial-1.22
%P 229-241
Markdown (Informal)
[Multi-party Goal Tracking with LLMs: Comparing Pre-training, Fine-tuning, and Prompt Engineering](https://aclanthology.org/2023.sigdial-1.22) (Addlesee et al., SIGDIAL 2023)
ACL
- Angus Addlesee, Weronika Sieińska, Nancie Gunson, Daniel Hernandez Garcia, Christian Dondrup, and Oliver Lemon. 2023. Multi-party Goal Tracking with LLMs: Comparing Pre-training, Fine-tuning, and Prompt Engineering. In Proceedings of the 24th Annual Meeting of the Special Interest Group on Discourse and Dialogue, pages 229–241, Prague, Czechia. Association for Computational Linguistics.