@inproceedings{xie-etal-2025-improving,
title = "Improving Model Factuality with Fine-grained Critique-based Evaluator",
author = "Xie, Yiqing and
Zhou, Wenxuan and
Prakash, Pradyot and
Jin, Di and
Mao, Yuning and
Fettes, Quintin and
Talebzadeh, Arya and
Wang, Sinong and
Fang, Han and
Rose, Carolyn and
Fried, Daniel and
Zhang, Hejia",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.400/",
doi = "10.18653/v1/2025.acl-long.400",
pages = "8140--8155",
ISBN = "979-8-89176-251-0",
abstract = "Factuality evaluation aims to detect factual errors produced by language models (LMs) and hence guide the development of more factual models. Towards this goal, we train a factuality evaluator, FenCE, that provides LM generators with claim-level factuality feedback. In particular, we train FenCE to (1) generate textual critiques along with scores and (2) make claim-level judgment based on diverse source documents obtained by various tools, via data augmentation on a combination of public judgment datasets. We then present a framework that leverages FenCE to improve the factuality of LM generators by constructing training data. Specifically, we generate a set of candidate responses, ask FenCE to revise and score each response without introducing lesser-known facts, and train the generator by preferring highly scored revised responses. Experiments show that our data augmentation methods improve the evaluator{'}s accuracy by 2.9{\%} on LLM-AggreFact. With FenCE, we improve Llama2-7B-chat/Llama3-8B-chat{'}s factuality rate by 16.86{\%}/14.45{\%} on FActScore, outperforming state-of-the-art factuality finetuning methods by 8.83{\%}/6.96{\%}."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xie-etal-2025-improving">
<titleInfo>
<title>Improving Model Factuality with Fine-grained Critique-based Evaluator</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yiqing</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenxuan</namePart>
<namePart type="family">Zhou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pradyot</namePart>
<namePart type="family">Prakash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Di</namePart>
<namePart type="family">Jin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuning</namePart>
<namePart type="family">Mao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Quintin</namePart>
<namePart type="family">Fettes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arya</namePart>
<namePart type="family">Talebzadeh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sinong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Han</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Fried</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hejia</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Factuality evaluation aims to detect factual errors produced by language models (LMs) and hence guide the development of more factual models. Towards this goal, we train a factuality evaluator, FenCE, that provides LM generators with claim-level factuality feedback. In particular, we train FenCE to (1) generate textual critiques along with scores and (2) make claim-level judgment based on diverse source documents obtained by various tools, via data augmentation on a combination of public judgment datasets. We then present a framework that leverages FenCE to improve the factuality of LM generators by constructing training data. Specifically, we generate a set of candidate responses, ask FenCE to revise and score each response without introducing lesser-known facts, and train the generator by preferring highly scored revised responses. Experiments show that our data augmentation methods improve the evaluator’s accuracy by 2.9% on LLM-AggreFact. With FenCE, we improve Llama2-7B-chat/Llama3-8B-chat’s factuality rate by 16.86%/14.45% on FActScore, outperforming state-of-the-art factuality finetuning methods by 8.83%/6.96%.</abstract>
<identifier type="citekey">xie-etal-2025-improving</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.400</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.400/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>8140</start>
<end>8155</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Model Factuality with Fine-grained Critique-based Evaluator
%A Xie, Yiqing
%A Zhou, Wenxuan
%A Prakash, Pradyot
%A Jin, Di
%A Mao, Yuning
%A Fettes, Quintin
%A Talebzadeh, Arya
%A Wang, Sinong
%A Fang, Han
%A Rose, Carolyn
%A Fried, Daniel
%A Zhang, Hejia
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F xie-etal-2025-improving
%X Factuality evaluation aims to detect factual errors produced by language models (LMs) and hence guide the development of more factual models. Towards this goal, we train a factuality evaluator, FenCE, that provides LM generators with claim-level factuality feedback. In particular, we train FenCE to (1) generate textual critiques along with scores and (2) make claim-level judgment based on diverse source documents obtained by various tools, via data augmentation on a combination of public judgment datasets. We then present a framework that leverages FenCE to improve the factuality of LM generators by constructing training data. Specifically, we generate a set of candidate responses, ask FenCE to revise and score each response without introducing lesser-known facts, and train the generator by preferring highly scored revised responses. Experiments show that our data augmentation methods improve the evaluator’s accuracy by 2.9% on LLM-AggreFact. With FenCE, we improve Llama2-7B-chat/Llama3-8B-chat’s factuality rate by 16.86%/14.45% on FActScore, outperforming state-of-the-art factuality finetuning methods by 8.83%/6.96%.
%R 10.18653/v1/2025.acl-long.400
%U https://aclanthology.org/2025.acl-long.400/
%U https://doi.org/10.18653/v1/2025.acl-long.400
%P 8140-8155
Markdown (Informal)
[Improving Model Factuality with Fine-grained Critique-based Evaluator](https://aclanthology.org/2025.acl-long.400/) (Xie et al., ACL 2025)
ACL
- Yiqing Xie, Wenxuan Zhou, Pradyot Prakash, Di Jin, Yuning Mao, Quintin Fettes, Arya Talebzadeh, Sinong Wang, Han Fang, Carolyn Rose, Daniel Fried, and Hejia Zhang. 2025. Improving Model Factuality with Fine-grained Critique-based Evaluator. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8140–8155, Vienna, Austria. Association for Computational Linguistics.