@inproceedings{agrahari-etal-2025-really,
title = "Can You Really Trust That Review? {P}roto{F}ew{R}o{BERT}a and {D}etect{AIR}ev: A Prototypical Few-Shot Method and Multi-Domain Benchmark for Detecting {AI}-Generated Reviews",
author = "Agrahari, Shifali and
Kumar, Sujit and
Sanasam, Ranbir Singh",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-ijcnlp.132/",
pages = "2118--2140",
ISBN = "979-8-89176-303-6",
abstract = "Synthetic reviews mislead users and erode trust in online marketplaces, and the advent of Large Language Models (LLMs) makes detecting such AI-generated content increasingly challenging due to their human-like fluency and coherence. In the literature, LLM-generated review detection datasets are limited to one or a few domains, with reviews generated by only a few LLMs. Consequently, datasets are limited in diversity in terms of both domain coverage and review generation styles. Models trained on such datasets generalize poorly, lacking cross-model adaptation and struggling to detect diverse LLM-generated reviews in real-world, open-domain scenarios. To address this, we introduce DetectAIRev, a benchmark dataset for AI-generated review detection that includes human-written reviews from diverse domains and AI-generated reviews generated by various categories of LLMs. We evaluate the quality and reliability of the proposed dataset through several ablation studies and human evaluations. Furthermore, we propose an AI-generated text detection method ProtoFewRoBERTa, a few-shot framework that combines prototypical networks with RoBERTa embeddings, which learn discriminative features across multiple LLMs and human-written text using only a few labeled examples per class to discriminate between LLMs as the author for text author detection. We conduct our experiments on the DetectAIRev and a publicly available benchmark dataset. Our experimental results suggest that our proposed methods outperform the state-of-the-art baseline models in detecting AI-generated reviews and text detection."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="agrahari-etal-2025-really">
<titleInfo>
<title>Can You Really Trust That Review? ProtoFewRoBERTa and DetectAIRev: A Prototypical Few-Shot Method and Multi-Domain Benchmark for Detecting AI-Generated Reviews</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shifali</namePart>
<namePart type="family">Agrahari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujit</namePart>
<namePart type="family">Kumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ranbir</namePart>
<namePart type="given">Singh</namePart>
<namePart type="family">Sanasam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haofen</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derek</namePart>
<namePart type="given">F</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Biplab</namePart>
<namePart type="family">Banerjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asif</namePart>
<namePart type="family">Ekbal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dhirendra</namePart>
<namePart type="given">Pratap</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The Asian Federation of Natural Language Processing and The Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-303-6</identifier>
</relatedItem>
<abstract>Synthetic reviews mislead users and erode trust in online marketplaces, and the advent of Large Language Models (LLMs) makes detecting such AI-generated content increasingly challenging due to their human-like fluency and coherence. In the literature, LLM-generated review detection datasets are limited to one or a few domains, with reviews generated by only a few LLMs. Consequently, datasets are limited in diversity in terms of both domain coverage and review generation styles. Models trained on such datasets generalize poorly, lacking cross-model adaptation and struggling to detect diverse LLM-generated reviews in real-world, open-domain scenarios. To address this, we introduce DetectAIRev, a benchmark dataset for AI-generated review detection that includes human-written reviews from diverse domains and AI-generated reviews generated by various categories of LLMs. We evaluate the quality and reliability of the proposed dataset through several ablation studies and human evaluations. Furthermore, we propose an AI-generated text detection method ProtoFewRoBERTa, a few-shot framework that combines prototypical networks with RoBERTa embeddings, which learn discriminative features across multiple LLMs and human-written text using only a few labeled examples per class to discriminate between LLMs as the author for text author detection. We conduct our experiments on the DetectAIRev and a publicly available benchmark dataset. Our experimental results suggest that our proposed methods outperform the state-of-the-art baseline models in detecting AI-generated reviews and text detection.</abstract>
<identifier type="citekey">agrahari-etal-2025-really</identifier>
<location>
<url>https://aclanthology.org/2025.findings-ijcnlp.132/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>2118</start>
<end>2140</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can You Really Trust That Review? ProtoFewRoBERTa and DetectAIRev: A Prototypical Few-Shot Method and Multi-Domain Benchmark for Detecting AI-Generated Reviews
%A Agrahari, Shifali
%A Kumar, Sujit
%A Sanasam, Ranbir Singh
%Y Inui, Kentaro
%Y Sakti, Sakriani
%Y Wang, Haofen
%Y Wong, Derek F.
%Y Bhattacharyya, Pushpak
%Y Banerjee, Biplab
%Y Ekbal, Asif
%Y Chakraborty, Tanmoy
%Y Singh, Dhirendra Pratap
%S Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics
%D 2025
%8 December
%I The Asian Federation of Natural Language Processing and The Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-303-6
%F agrahari-etal-2025-really
%X Synthetic reviews mislead users and erode trust in online marketplaces, and the advent of Large Language Models (LLMs) makes detecting such AI-generated content increasingly challenging due to their human-like fluency and coherence. In the literature, LLM-generated review detection datasets are limited to one or a few domains, with reviews generated by only a few LLMs. Consequently, datasets are limited in diversity in terms of both domain coverage and review generation styles. Models trained on such datasets generalize poorly, lacking cross-model adaptation and struggling to detect diverse LLM-generated reviews in real-world, open-domain scenarios. To address this, we introduce DetectAIRev, a benchmark dataset for AI-generated review detection that includes human-written reviews from diverse domains and AI-generated reviews generated by various categories of LLMs. We evaluate the quality and reliability of the proposed dataset through several ablation studies and human evaluations. Furthermore, we propose an AI-generated text detection method ProtoFewRoBERTa, a few-shot framework that combines prototypical networks with RoBERTa embeddings, which learn discriminative features across multiple LLMs and human-written text using only a few labeled examples per class to discriminate between LLMs as the author for text author detection. We conduct our experiments on the DetectAIRev and a publicly available benchmark dataset. Our experimental results suggest that our proposed methods outperform the state-of-the-art baseline models in detecting AI-generated reviews and text detection.
%U https://aclanthology.org/2025.findings-ijcnlp.132/
%P 2118-2140
Markdown (Informal)
[Can You Really Trust That Review? ProtoFewRoBERTa and DetectAIRev: A Prototypical Few-Shot Method and Multi-Domain Benchmark for Detecting AI-Generated Reviews](https://aclanthology.org/2025.findings-ijcnlp.132/) (Agrahari et al., Findings 2025)
ACL