@inproceedings{chen-etal-2026-peercheck,
title = "$PeerCheck$: Enhancing {LLM}-Generated Academic Reviews Towards Human-Level Quality",
author = "Chen, Zeyuan and
Yang, Ziqing and
Ma, Yihan and
Backes, Michael and
Zhang, Yang",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.1170/",
pages = "23362--23386",
ISBN = "979-8-89176-395-1",
abstract = "As academic submissions grow, the traditional peer review process struggles to keep up, raising concerns about quality and fairness. A trend of using large language models (LLMs) for assistance has emerged. In this work, we take a critical step toward improving the quality of LLM-generated reviews. We propose the $PeerCheck$ framework, which investigates LLM-human review differences ($\textbf{RQ1}$) and explores methods to increase LLM-human similarity ($\textbf{RQ2}$). We first analyzed the human-written reviews with reviews generated by GPT-4o, Claude-3.7-Sonnet, and DeepSeek-V3 and found that LLMs and humans focus on different terms, e.g., LLMs prioritize theory while humans emphasize methodology and experiments. We further adopt prompt engineering, such as Chain-of-Thought (CoT), and utilize retrieval-augmented generation (RAG) to enhance the LLM-generated reviews towards human-level quality. We find CoT significantly improves the human similarity of LLM reviews, while we also discover an unexpected ``RAG paradox,'' i.e., experiments with RAG produce different results for various LLMs and, in some cases, even reduce review quality. Our comprehensive analysis of LLM-generated academic reviews illustrates both possibilities and limitations, contributing to a more effective, human-aligned review system."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2026-peercheck">
<titleInfo>
<title>PeerCheck: Enhancing LLM-Generated Academic Reviews Towards Human-Level Quality</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zeyuan</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziqing</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yihan</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Backes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>As academic submissions grow, the traditional peer review process struggles to keep up, raising concerns about quality and fairness. A trend of using large language models (LLMs) for assistance has emerged. In this work, we take a critical step toward improving the quality of LLM-generated reviews. We propose the PeerCheck framework, which investigates LLM-human review differences (RQ1) and explores methods to increase LLM-human similarity (RQ2). We first analyzed the human-written reviews with reviews generated by GPT-4o, Claude-3.7-Sonnet, and DeepSeek-V3 and found that LLMs and humans focus on different terms, e.g., LLMs prioritize theory while humans emphasize methodology and experiments. We further adopt prompt engineering, such as Chain-of-Thought (CoT), and utilize retrieval-augmented generation (RAG) to enhance the LLM-generated reviews towards human-level quality. We find CoT significantly improves the human similarity of LLM reviews, while we also discover an unexpected “RAG paradox,” i.e., experiments with RAG produce different results for various LLMs and, in some cases, even reduce review quality. Our comprehensive analysis of LLM-generated academic reviews illustrates both possibilities and limitations, contributing to a more effective, human-aligned review system.</abstract>
<identifier type="citekey">chen-etal-2026-peercheck</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.1170/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>23362</start>
<end>23386</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PeerCheck: Enhancing LLM-Generated Academic Reviews Towards Human-Level Quality
%A Chen, Zeyuan
%A Yang, Ziqing
%A Ma, Yihan
%A Backes, Michael
%A Zhang, Yang
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F chen-etal-2026-peercheck
%X As academic submissions grow, the traditional peer review process struggles to keep up, raising concerns about quality and fairness. A trend of using large language models (LLMs) for assistance has emerged. In this work, we take a critical step toward improving the quality of LLM-generated reviews. We propose the PeerCheck framework, which investigates LLM-human review differences (RQ1) and explores methods to increase LLM-human similarity (RQ2). We first analyzed the human-written reviews with reviews generated by GPT-4o, Claude-3.7-Sonnet, and DeepSeek-V3 and found that LLMs and humans focus on different terms, e.g., LLMs prioritize theory while humans emphasize methodology and experiments. We further adopt prompt engineering, such as Chain-of-Thought (CoT), and utilize retrieval-augmented generation (RAG) to enhance the LLM-generated reviews towards human-level quality. We find CoT significantly improves the human similarity of LLM reviews, while we also discover an unexpected “RAG paradox,” i.e., experiments with RAG produce different results for various LLMs and, in some cases, even reduce review quality. Our comprehensive analysis of LLM-generated academic reviews illustrates both possibilities and limitations, contributing to a more effective, human-aligned review system.
%U https://aclanthology.org/2026.findings-acl.1170/
%P 23362-23386
Markdown (Informal)
[PeerCheck: Enhancing LLM-Generated Academic Reviews Towards Human-Level Quality](https://aclanthology.org/2026.findings-acl.1170/) (Chen et al., Findings 2026)
ACL