@inproceedings{majer-snajder-2024-claim,
title = "Claim Check-Worthiness Detection: How Well do {LLM}s Grasp Annotation Guidelines?",
author = "Majer, Laura and
{\v{S}}najder, Jan",
editor = "Schlichtkrull, Michael and
Chen, Yulong and
Whitehouse, Chenxi and
Deng, Zhenyun and
Akhtar, Mubashara and
Aly, Rami and
Guo, Zhijiang and
Christodoulopoulos, Christos and
Cocarascu, Oana and
Mittal, Arpit and
Thorne, James and
Vlachos, Andreas",
booktitle = "Proceedings of the Seventh Fact Extraction and VERification Workshop (FEVER)",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.fever-1.27",
pages = "245--263",
abstract = "The rising threat of disinformation underscores the need to fully or partially automate the fact-checking process. Identifying text segments requiring fact-checking is known as claim detection (CD) and claim check-worthiness detection (CW), the latter incorporating complex domain-specific criteria of worthiness and often framed as a ranking task. Zero- and few-shot LLM prompting is an attractive option for both tasks, as it bypasses the need for labeled datasets and allows verbalized claim and worthiness criteria to be directly used for prompting. We evaluate the LLMs{'} predictive accuracy on five CD/CW datasets from diverse domains, using corresponding annotation guidelines in prompts. We examine two key aspects: (1) how to best distill factuality and worthiness criteria into a prompt, and (2) how much context to provide for each claim. To this end, we experiment with different levels of prompt verbosity and varying amounts of contextual information given to the model. We additionally evaluate the top-performing models with ranking metrics, resembling prioritization done by fact-checkers. Our results show that optimal prompt verbosity varies, meta-data alone adds more performance boost than co-text, and confidence scores can be directly used to produce reliable check-worthiness rankings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="majer-snajder-2024-claim">
<titleInfo>
<title>Claim Check-Worthiness Detection: How Well do LLMs Grasp Annotation Guidelines?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Laura</namePart>
<namePart type="family">Majer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Šnajder</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh Fact Extraction and VERification Workshop (FEVER)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Schlichtkrull</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulong</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenxi</namePart>
<namePart type="family">Whitehouse</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenyun</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mubashara</namePart>
<namePart type="family">Akhtar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rami</namePart>
<namePart type="family">Aly</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhijiang</namePart>
<namePart type="family">Guo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oana</namePart>
<namePart type="family">Cocarascu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arpit</namePart>
<namePart type="family">Mittal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Thorne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andreas</namePart>
<namePart type="family">Vlachos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The rising threat of disinformation underscores the need to fully or partially automate the fact-checking process. Identifying text segments requiring fact-checking is known as claim detection (CD) and claim check-worthiness detection (CW), the latter incorporating complex domain-specific criteria of worthiness and often framed as a ranking task. Zero- and few-shot LLM prompting is an attractive option for both tasks, as it bypasses the need for labeled datasets and allows verbalized claim and worthiness criteria to be directly used for prompting. We evaluate the LLMs’ predictive accuracy on five CD/CW datasets from diverse domains, using corresponding annotation guidelines in prompts. We examine two key aspects: (1) how to best distill factuality and worthiness criteria into a prompt, and (2) how much context to provide for each claim. To this end, we experiment with different levels of prompt verbosity and varying amounts of contextual information given to the model. We additionally evaluate the top-performing models with ranking metrics, resembling prioritization done by fact-checkers. Our results show that optimal prompt verbosity varies, meta-data alone adds more performance boost than co-text, and confidence scores can be directly used to produce reliable check-worthiness rankings.</abstract>
<identifier type="citekey">majer-snajder-2024-claim</identifier>
<location>
<url>https://aclanthology.org/2024.fever-1.27</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>245</start>
<end>263</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Claim Check-Worthiness Detection: How Well do LLMs Grasp Annotation Guidelines?
%A Majer, Laura
%A Šnajder, Jan
%Y Schlichtkrull, Michael
%Y Chen, Yulong
%Y Whitehouse, Chenxi
%Y Deng, Zhenyun
%Y Akhtar, Mubashara
%Y Aly, Rami
%Y Guo, Zhijiang
%Y Christodoulopoulos, Christos
%Y Cocarascu, Oana
%Y Mittal, Arpit
%Y Thorne, James
%Y Vlachos, Andreas
%S Proceedings of the Seventh Fact Extraction and VERification Workshop (FEVER)
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F majer-snajder-2024-claim
%X The rising threat of disinformation underscores the need to fully or partially automate the fact-checking process. Identifying text segments requiring fact-checking is known as claim detection (CD) and claim check-worthiness detection (CW), the latter incorporating complex domain-specific criteria of worthiness and often framed as a ranking task. Zero- and few-shot LLM prompting is an attractive option for both tasks, as it bypasses the need for labeled datasets and allows verbalized claim and worthiness criteria to be directly used for prompting. We evaluate the LLMs’ predictive accuracy on five CD/CW datasets from diverse domains, using corresponding annotation guidelines in prompts. We examine two key aspects: (1) how to best distill factuality and worthiness criteria into a prompt, and (2) how much context to provide for each claim. To this end, we experiment with different levels of prompt verbosity and varying amounts of contextual information given to the model. We additionally evaluate the top-performing models with ranking metrics, resembling prioritization done by fact-checkers. Our results show that optimal prompt verbosity varies, meta-data alone adds more performance boost than co-text, and confidence scores can be directly used to produce reliable check-worthiness rankings.
%U https://aclanthology.org/2024.fever-1.27
%P 245-263
Markdown (Informal)
[Claim Check-Worthiness Detection: How Well do LLMs Grasp Annotation Guidelines?](https://aclanthology.org/2024.fever-1.27) (Majer & Šnajder, FEVER 2024)
ACL