@inproceedings{zhang-etal-2026-opengenalign,
title = "{O}pen{G}en{A}lign: A Preference Dataset and Benchmark for Trustworthy Reward Modeling in Open-Ended, Long-Context Generation",
author = "Zhang, Hanning and
Song, Juntong and
Zhu, Juno and
Wu, Yuanhao and
Zhang, Tong and
Niu, Cheng",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.553/",
doi = "10.18653/v1/2026.findings-acl.553",
pages = "11373--11392",
ISBN = "979-8-89176-395-1",
abstract = "Reward Modeling is critical in evaluating and improving the generation of Large Language Models (LLMs). {\%} They are typically trained on high-quality annotated datasets to distinguish between better and worse responses. While numerous recent works have shown its feasibility in improving safety, helpfulness, reasoning, and instruction-following ability, its capability and generalization to open-ended long-context generation is still rarely explored.In this paper, we introduce OpenGenAlign, a framework and a high-quality dataset designed to develop reward models to evaluate and improve hallucination-free, comprehensive, reliable, and efficient open-ended long-context generation. We define four key metrics to assess generation quality and develop an automated pipeline to evaluate the outputs of multiple LLMs across long-context QA, Data-to-Text, and Summarization scenarios using o3, ending up with 33K high-quality preference data with a human agreement rate of 81{\%}. Experimental results first demonstrate that existing reward models perform suboptimally on the held-out benchmark. And Our trained reward model achieves superior performance in the benchmark and effectively improves the generation quality of the policy models using Reinforcement Learning (RL). Additionally, OpenGenAlign could be used for effective guided generation in existing datasets. Furthermore, we demonstrate that the OpenGenAlign could be integrated with reward data from other domains to achieve better performance."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2026-opengenalign">
<titleInfo>
<title>OpenGenAlign: A Preference Dataset and Benchmark for Trustworthy Reward Modeling in Open-Ended, Long-Context Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hanning</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juntong</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juno</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuanhao</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng</namePart>
<namePart type="family">Niu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Reward Modeling is critical in evaluating and improving the generation of Large Language Models (LLMs). % They are typically trained on high-quality annotated datasets to distinguish between better and worse responses. While numerous recent works have shown its feasibility in improving safety, helpfulness, reasoning, and instruction-following ability, its capability and generalization to open-ended long-context generation is still rarely explored.In this paper, we introduce OpenGenAlign, a framework and a high-quality dataset designed to develop reward models to evaluate and improve hallucination-free, comprehensive, reliable, and efficient open-ended long-context generation. We define four key metrics to assess generation quality and develop an automated pipeline to evaluate the outputs of multiple LLMs across long-context QA, Data-to-Text, and Summarization scenarios using o3, ending up with 33K high-quality preference data with a human agreement rate of 81%. Experimental results first demonstrate that existing reward models perform suboptimally on the held-out benchmark. And Our trained reward model achieves superior performance in the benchmark and effectively improves the generation quality of the policy models using Reinforcement Learning (RL). Additionally, OpenGenAlign could be used for effective guided generation in existing datasets. Furthermore, we demonstrate that the OpenGenAlign could be integrated with reward data from other domains to achieve better performance.</abstract>
<identifier type="citekey">zhang-etal-2026-opengenalign</identifier>
<identifier type="doi">10.18653/v1/2026.findings-acl.553</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.553/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>11373</start>
<end>11392</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T OpenGenAlign: A Preference Dataset and Benchmark for Trustworthy Reward Modeling in Open-Ended, Long-Context Generation
%A Zhang, Hanning
%A Song, Juntong
%A Zhu, Juno
%A Wu, Yuanhao
%A Zhang, Tong
%A Niu, Cheng
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhang-etal-2026-opengenalign
%X Reward Modeling is critical in evaluating and improving the generation of Large Language Models (LLMs). % They are typically trained on high-quality annotated datasets to distinguish between better and worse responses. While numerous recent works have shown its feasibility in improving safety, helpfulness, reasoning, and instruction-following ability, its capability and generalization to open-ended long-context generation is still rarely explored.In this paper, we introduce OpenGenAlign, a framework and a high-quality dataset designed to develop reward models to evaluate and improve hallucination-free, comprehensive, reliable, and efficient open-ended long-context generation. We define four key metrics to assess generation quality and develop an automated pipeline to evaluate the outputs of multiple LLMs across long-context QA, Data-to-Text, and Summarization scenarios using o3, ending up with 33K high-quality preference data with a human agreement rate of 81%. Experimental results first demonstrate that existing reward models perform suboptimally on the held-out benchmark. And Our trained reward model achieves superior performance in the benchmark and effectively improves the generation quality of the policy models using Reinforcement Learning (RL). Additionally, OpenGenAlign could be used for effective guided generation in existing datasets. Furthermore, we demonstrate that the OpenGenAlign could be integrated with reward data from other domains to achieve better performance.
%R 10.18653/v1/2026.findings-acl.553
%U https://aclanthology.org/2026.findings-acl.553/
%U https://doi.org/10.18653/v1/2026.findings-acl.553
%P 11373-11392
Markdown (Informal)
[OpenGenAlign: A Preference Dataset and Benchmark for Trustworthy Reward Modeling in Open-Ended, Long-Context Generation](https://aclanthology.org/2026.findings-acl.553/) (Zhang et al., Findings 2026)
ACL
- Hanning Zhang, Juntong Song, Juno Zhu, Yuanhao Wu, Tong Zhang, and Cheng Niu. 2026. OpenGenAlign: A Preference Dataset and Benchmark for Trustworthy Reward Modeling in Open-Ended, Long-Context Generation. In Findings of the Association for Computational Linguistics: ACL 2026, pages 11373–11392, San Diego, California, United States. Association for Computational Linguistics.