@inproceedings{turkstra-etal-2025-trillama,
title = "{T}ri{LL}a{M}a at {CQ}s-Gen 2025: A Two-Stage {LLM}-Based System for Critical Question Generation",
author = "Turkstra, Frieso and
Nabhani, Sara and
Al-Khatib, Khalid",
editor = "Chistova, Elena and
Cimiano, Philipp and
Haddadan, Shohreh and
Lapesa, Gabriella and
Ruiz-Dolz, Ramon",
booktitle = "Proceedings of the 12th Argument mining Workshop",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.argmining-1.34/",
doi = "10.18653/v1/2025.argmining-1.34",
pages = "349--357",
ISBN = "979-8-89176-258-9",
abstract = "This paper presents a new system for generating critical questions in debates, developed for the Critical Questions Generation shared task. Our two-stage approach, combining generation and classification, utilizes LLaMA 3.1 Instruct models (8B, 70B, 405B) with zero-/few-shot prompting. Evaluations on annotated debate data reveal several key insights: few-shot generation with 405B yielded relatively high-quality questions, achieving a maximum possible punctuation score of 73.5. The 70B model outperformed both smaller and larger variants on the classification part. The classifiers showed a strong bias toward labeling generated questions as Useful, despite limited validation. Further, our system, ranked 6 extsuperscriptth, out-performed baselines by 3{\%}. These findings stress the effectiveness of large-sized models for question generation and medium-sized models for classification, and suggest the need for clearer task definitions within prompts to improve classification accuracy."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="turkstra-etal-2025-trillama">
<titleInfo>
<title>TriLLaMa at CQs-Gen 2025: A Two-Stage LLM-Based System for Critical Question Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Frieso</namePart>
<namePart type="family">Turkstra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Nabhani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Al-Khatib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Argument Mining Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Chistova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Cimiano</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shohreh</namePart>
<namePart type="family">Haddadan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriella</namePart>
<namePart type="family">Lapesa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ramon</namePart>
<namePart type="family">Ruiz-Dolz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-258-9</identifier>
</relatedItem>
<abstract>This paper presents a new system for generating critical questions in debates, developed for the Critical Questions Generation shared task. Our two-stage approach, combining generation and classification, utilizes LLaMA 3.1 Instruct models (8B, 70B, 405B) with zero-/few-shot prompting. Evaluations on annotated debate data reveal several key insights: few-shot generation with 405B yielded relatively high-quality questions, achieving a maximum possible punctuation score of 73.5. The 70B model outperformed both smaller and larger variants on the classification part. The classifiers showed a strong bias toward labeling generated questions as Useful, despite limited validation. Further, our system, ranked 6th, outperformed baselines by 3%. These findings stress the effectiveness of large-sized models for question generation and medium-sized models for classification, and suggest the need for clearer task definitions within prompts to improve classification accuracy.</abstract>
<identifier type="citekey">turkstra-etal-2025-trillama</identifier>
<identifier type="doi">10.18653/v1/2025.argmining-1.34</identifier>
<location>
<url>https://aclanthology.org/2025.argmining-1.34/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>349</start>
<end>357</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TriLLaMa at CQs-Gen 2025: A Two-Stage LLM-Based System for Critical Question Generation
%A Turkstra, Frieso
%A Nabhani, Sara
%A Al-Khatib, Khalid
%Y Chistova, Elena
%Y Cimiano, Philipp
%Y Haddadan, Shohreh
%Y Lapesa, Gabriella
%Y Ruiz-Dolz, Ramon
%S Proceedings of the 12th Argument Mining Workshop
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-258-9
%F turkstra-etal-2025-trillama
%X This paper presents a new system for generating critical questions in debates, developed for the Critical Questions Generation shared task. Our two-stage approach, combining generation and classification, utilizes LLaMA 3.1 Instruct models (8B, 70B, 405B) with zero-/few-shot prompting. Evaluations on annotated debate data reveal several key insights: few-shot generation with 405B yielded relatively high-quality questions, achieving a maximum possible punctuation score of 73.5. The 70B model outperformed both smaller and larger variants on the classification part. The classifiers showed a strong bias toward labeling generated questions as Useful, despite limited validation. Further, our system, ranked 6th, outperformed baselines by 3%. These findings stress the effectiveness of large-sized models for question generation and medium-sized models for classification, and suggest the need for clearer task definitions within prompts to improve classification accuracy.
%R 10.18653/v1/2025.argmining-1.34
%U https://aclanthology.org/2025.argmining-1.34/
%U https://doi.org/10.18653/v1/2025.argmining-1.34
%P 349-357
Markdown (Informal)
[TriLLaMa at CQs-Gen 2025: A Two-Stage LLM-Based System for Critical Question Generation](https://aclanthology.org/2025.argmining-1.34/) (Turkstra et al., ArgMining 2025)
ACL
Frieso Turkstra, Sara Nabhani, and Khalid Al-Khatib. 2025. TriLLaMa at CQs-Gen 2025: A Two-Stage LLM-Based System for Critical Question Generation. In Proceedings of the 12th Argument Mining Workshop, pages 349–357, Vienna, Austria. Association for Computational Linguistics.