@inproceedings{lemesle-etal-2026-pluie,
title = "*-{PLUIE}: Personalisable metric with Llm Used for Improved Evaluation",
author = "Lemesle, Quentin and
Jourdan, Leane and
Munson, Daisy and
Alain, Pierre and
Chevelu, Jonathan and
Delhay, Arnaud and
Lolive, Damien",
editor = "Mohammad, Saif M. and
Ousidhoum, Nedjma",
booktitle = "Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*{SEM} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.starsem-conference.14/",
pages = "211--243",
ISBN = "979-8-89176-413-2",
abstract = "Evaluating the quality of automatically generated text often relies on LLM-as-a-judge (LLM-judge) methods. While effective, these approaches are computationally expensive and require post-processing. To address these limitations, we build upon ParaPLUIE, a perplexity-based LLM-judge metric that estimates confidence over ``Yes/No'' answers without generating text. We introduce *-PLUIE, task-specific prompting variants of ParaPLUIE and evaluate their alignment with human judgement. Our experiments show that personalised *-PLUIE achieves stronger correlations with human ratings while maintaining low computational cost."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lemesle-etal-2026-pluie">
<titleInfo>
<title>*-PLUIE: Personalisable metric with Llm Used for Improved Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Quentin</namePart>
<namePart type="family">Lemesle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leane</namePart>
<namePart type="family">Jourdan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daisy</namePart>
<namePart type="family">Munson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Alain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Chevelu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arnaud</namePart>
<namePart type="family">Delhay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damien</namePart>
<namePart type="family">Lolive</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Saif</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Mohammad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nedjma</namePart>
<namePart type="family">Ousidhoum</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-413-2</identifier>
</relatedItem>
<abstract>Evaluating the quality of automatically generated text often relies on LLM-as-a-judge (LLM-judge) methods. While effective, these approaches are computationally expensive and require post-processing. To address these limitations, we build upon ParaPLUIE, a perplexity-based LLM-judge metric that estimates confidence over “Yes/No” answers without generating text. We introduce *-PLUIE, task-specific prompting variants of ParaPLUIE and evaluate their alignment with human judgement. Our experiments show that personalised *-PLUIE achieves stronger correlations with human ratings while maintaining low computational cost.</abstract>
<identifier type="citekey">lemesle-etal-2026-pluie</identifier>
<location>
<url>https://aclanthology.org/2026.starsem-conference.14/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>211</start>
<end>243</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T *-PLUIE: Personalisable metric with Llm Used for Improved Evaluation
%A Lemesle, Quentin
%A Jourdan, Leane
%A Munson, Daisy
%A Alain, Pierre
%A Chevelu, Jonathan
%A Delhay, Arnaud
%A Lolive, Damien
%Y Mohammad, Saif M.
%Y Ousidhoum, Nedjma
%S Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-413-2
%F lemesle-etal-2026-pluie
%X Evaluating the quality of automatically generated text often relies on LLM-as-a-judge (LLM-judge) methods. While effective, these approaches are computationally expensive and require post-processing. To address these limitations, we build upon ParaPLUIE, a perplexity-based LLM-judge metric that estimates confidence over “Yes/No” answers without generating text. We introduce *-PLUIE, task-specific prompting variants of ParaPLUIE and evaluate their alignment with human judgement. Our experiments show that personalised *-PLUIE achieves stronger correlations with human ratings while maintaining low computational cost.
%U https://aclanthology.org/2026.starsem-conference.14/
%P 211-243
Markdown (Informal)
[*-PLUIE: Personalisable metric with Llm Used for Improved Evaluation](https://aclanthology.org/2026.starsem-conference.14/) (Lemesle et al., *SEM 2026)
ACL
- Quentin Lemesle, Leane Jourdan, Daisy Munson, Pierre Alain, Jonathan Chevelu, Arnaud Delhay, and Damien Lolive. 2026. *-PLUIE: Personalisable metric with Llm Used for Improved Evaluation. In Proceedings of the 15th Joint Conference on Lexical and Computational Semantics (*SEM 2026), pages 211–243, San Diego, California, United States. Association for Computational Linguistics.