BibTeX
@inproceedings{mamta-cocarascu-2025-facteval,
title = "{F}act{E}val: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models",
author = "Mamta and
Cocarascu, Oana",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.534/",
doi = "10.18653/v1/2025.naacl-long.534",
pages = "10647--10660",
ISBN = "979-8-89176-189-6",
    abstract = "Whilst large language models (LLMs) have made significant advances in every natural language processing task, studies have shown that these models are vulnerable to small perturbations in the inputs, raising concerns about their robustness in the real world. Given the rise of misinformation online and its significant impact on society, fact verification is one area in which assessing the robustness of models developed for this task is crucial. However, the robustness of LLMs in fact verification remains largely unexplored. In this paper, we introduce FactEval, a novel large-scale benchmark for extensive evaluation of LLMs in the fact verification domain covering 17 realistic word-level and character-level perturbations and 4 types of subpopulations. We investigate the robustness of several LLMs in zero-shot, few-shot, and chain-of-thought prompting. Our analysis using FEVER, one of the largest and most widely used datasets for fact verification, reveals that LLMs are brittle to small input changes and also exhibit performance variations across different subpopulations."
}

MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mamta-cocarascu-2025-facteval">
    <titleInfo>
      <title>FactEval: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models</title>
    </titleInfo>
    <name>
      <namePart>Mamta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Oana</namePart>
      <namePart type="family">Cocarascu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Luis</namePart>
        <namePart type="family">Chiruzzo</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alan</namePart>
        <namePart type="family">Ritter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lu</namePart>
        <namePart type="family">Wang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Albuquerque, New Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-189-6</identifier>
    </relatedItem>
    <abstract>Whilst large language models (LLMs) have made significant advances in every natural language processing task, studies have shown that these models are vulnerable to small perturbations in the inputs, raising concerns about their robustness in the real world. Given the rise of misinformation online and its significant impact on society, fact verification is one area in which assessing the robustness of models developed for this task is crucial. However, the robustness of LLMs in fact verification remains largely unexplored. In this paper, we introduce FactEval, a novel large-scale benchmark for extensive evaluation of LLMs in the fact verification domain covering 17 realistic word-level and character-level perturbations and 4 types of subpopulations. We investigate the robustness of several LLMs in zero-shot, few-shot, and chain-of-thought prompting. Our analysis using FEVER, one of the largest and most widely used datasets for fact verification, reveals that LLMs are brittle to small input changes and also exhibit performance variations across different subpopulations.</abstract>
    <identifier type="citekey">mamta-cocarascu-2025-facteval</identifier>
    <identifier type="doi">10.18653/v1/2025.naacl-long.534</identifier>
    <location>
      <url>https://aclanthology.org/2025.naacl-long.534/</url>
    </location>
    <part>
      <date>2025-04</date>
      <extent unit="page">
        <start>10647</start>
        <end>10660</end>
      </extent>
    </part>
  </mods>
</modsCollection>

Endnote
%0 Conference Proceedings
%T FactEval: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models
%A Mamta
%A Cocarascu, Oana
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F mamta-cocarascu-2025-facteval
%X Whilst large language models (LLMs) have made significant advances in every natural language processing task, studies have shown that these models are vulnerable to small perturbations in the inputs, raising concerns about their robustness in the real world. Given the rise of misinformation online and its significant impact on society, fact verification is one area in which assessing the robustness of models developed for this task is crucial. However, the robustness of LLMs in fact verification remains largely unexplored. In this paper, we introduce FactEval, a novel large-scale benchmark for extensive evaluation of LLMs in the fact verification domain covering 17 realistic word-level and character-level perturbations and 4 types of subpopulations. We investigate the robustness of several LLMs in zero-shot, few-shot, and chain-of-thought prompting. Our analysis using FEVER, one of the largest and most widely used datasets for fact verification, reveals that LLMs are brittle to small input changes and also exhibit performance variations across different subpopulations.
%R 10.18653/v1/2025.naacl-long.534
%U https://aclanthology.org/2025.naacl-long.534/
%U https://doi.org/10.18653/v1/2025.naacl-long.534
%P 10647-10660

Markdown (Informal)
[FactEval: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models](https://aclanthology.org/2025.naacl-long.534/) (Mamta & Cocarascu, NAACL 2025)

ACL
Mamta and Oana Cocarascu. 2025. [FactEval: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models](https://aclanthology.org/2025.naacl-long.534/). In *Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)*, pages 10647–10660, Albuquerque, New Mexico. Association for Computational Linguistics.
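
For readers skimming this entry, here is a minimal sketch of the kind of character-level perturbation the abstract refers to, applied to a claim before prompting an LLM verifier. This is an illustration only, not FactEval's actual implementation: the function name, perturbation choice, and parameters are assumptions.

```python
import random

def swap_adjacent_chars(claim: str, rate: float = 0.1, seed: int = 0) -> str:
    """Introduce subtle typos by swapping one pair of adjacent inner
    characters in randomly chosen words of a claim (illustrative only)."""
    rng = random.Random(seed)
    words = claim.split()
    for i, word in enumerate(words):
        if len(word) > 3 and rng.random() < rate:
            # Pick a swap position away from word boundaries so the
            # perturbation stays realistic and the word remains readable.
            j = rng.randrange(1, len(word) - 2)
            chars = list(word)
            chars[j], chars[j + 1] = chars[j + 1], chars[j]
            words[i] = "".join(chars)
    return " ".join(words)

# Example: perturb a FEVER-style claim before passing it to a fact verifier.
print(swap_adjacent_chars("Barack Obama was born in Honolulu, Hawaii.", rate=0.5))
```

A robustness benchmark of this kind compares the verifier's label on the original and perturbed claims; a robust system should return the same verdict for both.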