@inproceedings{saaro-etal-2026-classify,
title = "Do {NOT} Classify and Count: Hybrid Attribute Control Success Evaluation",
author = {Saaro, Felix Matthias and
von D{\"a}niken, Pius and
Cieliebak, Mark and
Deriu, Jan Milan},
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-long.48/",
pages = "1101--1114",
ISBN = "979-8-89176-380-7",
abstract = "Evaluating attribute control success in controllable text generation and related generation tasks typically relies on pretrained classifiers. We show that this widely used classify-and-count approach yields biased and inconsistent results, with estimates varying significantly across classifiers. We frame control success estimation as a quantification task and apply a hybrid Bayesian method that combines classifier predictions with a small number of human labels for calibration. To test our approach, we collected a two-modality test dataset consisting of 600 human-rated samples and 60,000 automatically rated samples. Our experiments show that our approach produces robust estimates of control success across both text and text-to-image generation tasks, offering a principled alternative to current evaluation practices."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="saaro-etal-2026-classify">
<titleInfo>
<title>Do NOT Classify and Count: Hybrid Attribute Control Success Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Felix</namePart>
<namePart type="given">Matthias</namePart>
<namePart type="family">Saaro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pius</namePart>
<namePart type="family">von Däniken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Cieliebak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="given">Milan</namePart>
<namePart type="family">Deriu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-380-7</identifier>
</relatedItem>
<abstract>Evaluating attribute control success in controllable text generation and related generation tasks typically relies on pretrained classifiers. We show that this widely used classify-and-count approach yields biased and inconsistent results, with estimates varying significantly across classifiers. We frame control success estimation as a quantification task and apply a hybrid Bayesian method that combines classifier predictions with a small number of human labels for calibration. To test our approach, we collected a two-modality test dataset consisting of 600 human-rated samples and 60,000 automatically rated samples. Our experiments show that our approach produces robust estimates of control success across both text and text-to-image generation tasks, offering a principled alternative to current evaluation practices.</abstract>
<identifier type="citekey">saaro-etal-2026-classify</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-long.48/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>1101</start>
<end>1114</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do NOT Classify and Count: Hybrid Attribute Control Success Evaluation
%A Saaro, Felix Matthias
%A von Däniken, Pius
%A Cieliebak, Mark
%A Deriu, Jan Milan
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-380-7
%F saaro-etal-2026-classify
%X Evaluating attribute control success in controllable text generation and related generation tasks typically relies on pretrained classifiers. We show that this widely used classify-and-count approach yields biased and inconsistent results, with estimates varying significantly across classifiers. We frame control success estimation as a quantification task and apply a hybrid Bayesian method that combines classifier predictions with a small number of human labels for calibration. To test our approach, we collected a two-modality test dataset consisting of 600 human-rated samples and 60,000 automatically rated samples. Our experiments show that our approach produces robust estimates of control success across both text and text-to-image generation tasks, offering a principled alternative to current evaluation practices.
%U https://aclanthology.org/2026.eacl-long.48/
%P 1101-1114
Markdown (Informal)
[Do NOT Classify and Count: Hybrid Attribute Control Success Evaluation](https://aclanthology.org/2026.eacl-long.48/) (Saaro et al., EACL 2026)
ACL
- Felix Matthias Saaro, Pius von Däniken, Mark Cieliebak, and Jan Milan Deriu. 2026. Do NOT Classify and Count: Hybrid Attribute Control Success Evaluation. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1101–1114, Rabat, Morocco. Association for Computational Linguistics.