@inproceedings{el-baff-etal-2025-criticalbrew,
title = "{C}ritical{B}rew at {CQ}s-Gen 2025: Collaborative Multi-Agent Generation and Evaluation of Critical Questions for Arguments",
author = "El Baff, Roxanne and
Opitz, Dominik and
Diallo, Diaoul{\'e}",
editor = "Chistova, Elena and
Cimiano, Philipp and
Haddadan, Shohreh and
Lapesa, Gabriella and
Ruiz-Dolz, Ramon",
booktitle = "Proceedings of the 12th Argument Mining Workshop",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.argmining-1.30/",
doi = "10.18653/v1/2025.argmining-1.30",
pages = "314--321",
ISBN = "979-8-89176-258-9",
abstract = "This paper presents the \textit{CriticalBrew} submission to the CQs-Gen 2025 shared task, which focuses on generating critical questions (CQs) for a given argument. Our approach employs a multi-agent framework containing two sequential components: 1) \textbf{Generation}: machine society simulation for generating CQs and 2) \textbf{Evaluation}: LLM-based evaluation for selecting the top three questions. The first models collaboration as a sequence of thinking patterns (e.g., \textit{debate} {\textrightarrow} \textit{reflect}). The second assesses the generated questions using zero-shot prompting, evaluating them against several criteria (e.g., depth). Experiments with different open-weight LLMs (small vs. large) consistently outperformed the baseline, a single LLM with zero-shot prompting. Two configurations, agent count and thinking patterns, significantly impacted the performance in the shared task{'}s CQ-usefulness evaluation, whereas different LLM-based evaluation strategies (e.g., scoring) had no impact. Our code is available on GitHub."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="el-baff-etal-2025-criticalbrew">
    <titleInfo>
      <title>CriticalBrew at CQs-Gen 2025: Collaborative Multi-Agent Generation and Evaluation of Critical Questions for Arguments</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Roxanne</namePart>
      <namePart type="family">El Baff</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dominik</namePart>
      <namePart type="family">Opitz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Diaoulé</namePart>
      <namePart type="family">Diallo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 12th Argument Mining Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Elena</namePart>
        <namePart type="family">Chistova</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Philipp</namePart>
        <namePart type="family">Cimiano</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Shohreh</namePart>
        <namePart type="family">Haddadan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Gabriella</namePart>
        <namePart type="family">Lapesa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ramon</namePart>
        <namePart type="family">Ruiz-Dolz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Vienna, Austria</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-258-9</identifier>
    </relatedItem>
    <abstract>This paper presents the CriticalBrew submission to the CQs-Gen 2025 shared task, which focuses on generating critical questions (CQs) for a given argument. Our approach employs a multi-agent framework containing two sequential components: 1) Generation: machine society simulation for generating CQs and 2) Evaluation: LLM-based evaluation for selecting the top three questions. The first models collaboration as a sequence of thinking patterns (e.g., debate → reflect). The second assesses the generated questions using zero-shot prompting, evaluating them against several criteria (e.g., depth). Experiments with different open-weight LLMs (small vs. large) consistently outperformed the baseline, a single LLM with zero-shot prompting. Two configurations, agent count and thinking patterns, significantly impacted the performance in the shared task’s CQ-usefulness evaluation, whereas different LLM-based evaluation strategies (e.g., scoring) had no impact. Our code is available on GitHub.</abstract>
    <identifier type="citekey">el-baff-etal-2025-criticalbrew</identifier>
    <identifier type="doi">10.18653/v1/2025.argmining-1.30</identifier>
    <location>
      <url>https://aclanthology.org/2025.argmining-1.30/</url>
    </location>
    <part>
      <date>2025-07</date>
      <extent unit="page">
        <start>314</start>
        <end>321</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T CriticalBrew at CQs-Gen 2025: Collaborative Multi-Agent Generation and Evaluation of Critical Questions for Arguments
%A El Baff, Roxanne
%A Opitz, Dominik
%A Diallo, Diaoulé
%Y Chistova, Elena
%Y Cimiano, Philipp
%Y Haddadan, Shohreh
%Y Lapesa, Gabriella
%Y Ruiz-Dolz, Ramon
%S Proceedings of the 12th Argument Mining Workshop
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-258-9
%F el-baff-etal-2025-criticalbrew
%X This paper presents the CriticalBrew submission to the CQs-Gen 2025 shared task, which focuses on generating critical questions (CQs) for a given argument. Our approach employs a multi-agent framework containing two sequential components: 1) Generation: machine society simulation for generating CQs and 2) Evaluation: LLM-based evaluation for selecting the top three questions. The first models collaboration as a sequence of thinking patterns (e.g., debate → reflect). The second assesses the generated questions using zero-shot prompting, evaluating them against several criteria (e.g., depth). Experiments with different open-weight LLMs (small vs. large) consistently outperformed the baseline, a single LLM with zero-shot prompting. Two configurations, agent count and thinking patterns, significantly impacted the performance in the shared task’s CQ-usefulness evaluation, whereas different LLM-based evaluation strategies (e.g., scoring) had no impact. Our code is available on GitHub.
%R 10.18653/v1/2025.argmining-1.30
%U https://aclanthology.org/2025.argmining-1.30/
%U https://doi.org/10.18653/v1/2025.argmining-1.30
%P 314-321

Markdown (Informal)
[CriticalBrew at CQs-Gen 2025: Collaborative Multi-Agent Generation and Evaluation of Critical Questions for Arguments](https://aclanthology.org/2025.argmining-1.30/) (El Baff et al., ArgMining 2025)

ACL
Roxanne El Baff, Dominik Opitz, and Diaoulé Diallo. 2025. CriticalBrew at CQs-Gen 2025: Collaborative Multi-Agent Generation and Evaluation of Critical Questions for Arguments. In Proceedings of the 12th Argument Mining Workshop, pages 314–321, Vienna, Austria. Association for Computational Linguistics.