@inproceedings{wang-etal-2026-evolvr,
title = "{E}volv{R}: Self-Evolving Pairwise Reasoning for Story Evaluation to Enhance Generation",
author = "Wang, Xinda and
Hou, Zhengxu and
Zhang, Yangshijie and
Yanbingren and
Liu, Jialin and
Zhao, ChenZhuo and
Yang, Zhibo and
Yang, Bin-Bin and
Xiao, Feng",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-long.878/",
pages = "19208--19241",
ISBN = "979-8-89176-390-6",
abstract = "Although the effectiveness of Large Language Models as judges has been validated, their performance remains limited in open-ended tasks, particularly in story evaluation. Accurate story evaluation is crucial not only for assisting human quality judgment but also for providing reward signals to guide story generation. However, existing methods face a dilemma: prompt engineering for closed-source models suffers from poor adaptability, while fine-tuning approaches for open-source models lack the reasoning capabilities essential for story evaluation. To address this, we propose the Self-Evolving Pairwise Reasoning (EvolvR) framework. Grounded in pairwise comparison, the framework first self-synthesizes score-aligned Chain-of-Thought (CoT) data via a multi-persona strategy. To ensure data quality, these raw CoTs undergo a self-filtering process, utilizing multi-agents to guarantee their logical rigor and robustness. Finally, the evaluator trained on the refined data is deployed as a reward model to guide the story generation task. Experimental results demonstrate that our framework achieves state-of-the-art performance on three evaluation benchmarks including StoryER, HANNA and OpenMEVA. Furthermore, when served as a reward model, it enhances the quality of generated stories, thereby validating the superiority of our self-evolving approach."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2026-evolvr">
<titleInfo>
<title>EvolvR: Self-Evolving Pairwise Reasoning for Story Evaluation to Enhance Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xinda</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengxu</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yangshijie</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name>
<namePart>Yanbingren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jialin</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">ChenZhuo</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhibo</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bin-Bin</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Feng</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-390-6</identifier>
</relatedItem>
<abstract>Although the effectiveness of Large Language Models as judges has been validated, their performance remains limited in open-ended tasks, particularly in story evaluation. Accurate story evaluation is crucial not only for assisting human quality judgment but also for providing reward signals to guide story generation. However, existing methods face a dilemma: prompt engineering for closed-source models suffers from poor adaptability, while fine-tuning approaches for open-source models lack the reasoning capabilities essential for story evaluation. To address this, we propose the Self-Evolving Pairwise Reasoning (EvolvR) framework. Grounded in pairwise comparison, the framework first self-synthesizes score-aligned Chain-of-Thought (CoT) data via a multi-persona strategy. To ensure data quality, these raw CoTs undergo a self-filtering process, utilizing multi-agents to guarantee their logical rigor and robustness. Finally, the evaluator trained on the refined data is deployed as a reward model to guide the story generation task. Experimental results demonstrate that our framework achieves state-of-the-art performance on three evaluation benchmarks including StoryER, HANNA and OpenMEVA. Furthermore, when served as a reward model, it enhances the quality of generated stories, thereby validating the superiority of our self-evolving approach.</abstract>
<identifier type="citekey">wang-etal-2026-evolvr</identifier>
<location>
<url>https://aclanthology.org/2026.acl-long.878/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>19208</start>
<end>19241</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T EvolvR: Self-Evolving Pairwise Reasoning for Story Evaluation to Enhance Generation
%A Wang, Xinda
%A Hou, Zhengxu
%A Zhang, Yangshijie
%A Liu, Jialin
%A Zhao, ChenZhuo
%A Yang, Zhibo
%A Yang, Bin-Bin
%A Xiao, Feng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%A Yanbingren
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F wang-etal-2026-evolvr
%X Although the effectiveness of Large Language Models as judges has been validated, their performance remains limited in open-ended tasks, particularly in story evaluation. Accurate story evaluation is crucial not only for assisting human quality judgment but also for providing reward signals to guide story generation. However, existing methods face a dilemma: prompt engineering for closed-source models suffers from poor adaptability, while fine-tuning approaches for open-source models lack the reasoning capabilities essential for story evaluation. To address this, we propose the Self-Evolving Pairwise Reasoning (EvolvR) framework. Grounded in pairwise comparison, the framework first self-synthesizes score-aligned Chain-of-Thought (CoT) data via a multi-persona strategy. To ensure data quality, these raw CoTs undergo a self-filtering process, utilizing multi-agents to guarantee their logical rigor and robustness. Finally, the evaluator trained on the refined data is deployed as a reward model to guide the story generation task. Experimental results demonstrate that our framework achieves state-of-the-art performance on three evaluation benchmarks including StoryER, HANNA and OpenMEVA. Furthermore, when served as a reward model, it enhances the quality of generated stories, thereby validating the superiority of our self-evolving approach.
%U https://aclanthology.org/2026.acl-long.878/
%P 19208-19241
Markdown (Informal)
[EvolvR: Self-Evolving Pairwise Reasoning for Story Evaluation to Enhance Generation](https://aclanthology.org/2026.acl-long.878/) (Wang et al., ACL 2026)
ACL
- Xinda Wang, Zhengxu Hou, Yangshijie Zhang, Yanbingren, Jialin Liu, ChenZhuo Zhao, Zhibo Yang, Bin-Bin Yang, and Feng Xiao. 2026. EvolvR: Self-Evolving Pairwise Reasoning for Story Evaluation to Enhance Generation. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 19208–19241, San Diego, California, United States. Association for Computational Linguistics.