% BibTeX entry for the workshop paper published at https://aclanthology.org/2026.argmining-1.6/
@inproceedings{nikishina-etal-2026-argument,
  title     = {Argument-Based Comparative Question Answering Evaluation Benchmark},
  author    = {Nikishina, Irina and
               Anwar, Saba and
               Dolgov, Nikolay and
               Manina, Maria and
               Ignatenko, Daria and
               Moskvoretskii, Viktor and
               Shelmanov, Artem and
               Baldwin, Tim and
               Biemann, Chris},
  editor    = {Elaraby, Mohamed and
               Hautli-Janisz, Annette and
               Romberg, Julia and
               Musi, Elena and
               Ruggeri, Federico and
               Lawrence, John},
  booktitle = {Proceedings of the 13th Workshop on Argument Mining and Reasoning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.argmining-1.6/},
  pages     = {43--51},
  isbn      = {979-8-89176-399-9},
  abstract  = {Despite the ability of large language models (LLMs) to generate coherent comparative answers, automatic comparative question answering (CQA) remains challenging due to the absence of standardized evaluation criteria and the high resource demands of manual assessment. To address these problems, this paper proposes a comprehensive evaluation framework designed to assess the quality of CQA summaries using LLMs-as-a-Judge. We formulate 15 evaluation criteria for assessing comparative answers generated by various sources, including LLMs, human experts, and prior work. To capture a diverse range of comparative answers, LLM summaries were generated under various prompting scenarios. We evaluate the effectiveness of our framework using both human assessment and LLMs, demonstrating the consistency between automated and manual evaluations. Finally, we fine-tune Llama-3-8B-Instruct on a dataset generated from the best-performing CQA models in our evaluation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier given further down. -->
  <mods ID="nikishina-etal-2026-argument">
    <titleInfo>
      <title>Argument-Based Comparative Question Answering Evaluation Benchmark</title>
    </titleInfo>
    <!-- Paper authors, one <name> element each. -->
    <name type="personal">
      <namePart type="given">Irina</namePart>
      <namePart type="family">Nikishina</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saba</namePart>
      <namePart type="family">Anwar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nikolay</namePart>
      <namePart type="family">Dolgov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maria</namePart>
      <namePart type="family">Manina</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Daria</namePart>
      <namePart type="family">Ignatenko</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Viktor</namePart>
      <namePart type="family">Moskvoretskii</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Artem</namePart>
      <namePart type="family">Shelmanov</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tim</namePart>
      <namePart type="family">Baldwin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chris</namePart>
      <namePart type="family">Biemann</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 13th Workshop on Argument Mining and Reasoning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mohamed</namePart>
        <namePart type="family">Elaraby</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Annette</namePart>
        <namePart type="family">Hautli-Janisz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Julia</namePart>
        <namePart type="family">Romberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Elena</namePart>
        <namePart type="family">Musi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Federico</namePart>
        <namePart type="family">Ruggeri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">John</namePart>
        <namePart type="family">Lawrence</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-399-9</identifier>
    </relatedItem>
    <abstract>Despite the ability of large language models (LLMs) to generate coherent comparative answers, automatic comparative question answering (CQA) remains challenging due to the absence of standardized evaluation criteria and the high resource demands of manual assessment. To address these problems, this paper proposes a comprehensive evaluation framework designed to assess the quality of CQA summaries using LLMs-as-a-Judge. We formulate 15 evaluation criteria for assessing comparative answers generated by various sources, including LLMs, human experts, and prior work. To capture a diverse range of comparative answers, LLM summaries were generated under various prompting scenarios. We evaluate the effectiveness of our framework using both human assessment and LLMs, demonstrating the consistency between automated and manual evaluations. Finally, we fine-tune Llama-3-8B-Instruct on a dataset generated from the best-performing CQA models in our evaluation.</abstract>
    <identifier type="citekey">nikishina-etal-2026-argument</identifier>
    <location>
      <url>https://aclanthology.org/2026.argmining-1.6/</url>
    </location>
    <!-- Position of the paper inside the host volume: issue date and page range. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>43</start>
        <end>51</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Argument-Based Comparative Question Answering Evaluation Benchmark
%A Nikishina, Irina
%A Anwar, Saba
%A Dolgov, Nikolay
%A Manina, Maria
%A Ignatenko, Daria
%A Moskvoretskii, Viktor
%A Shelmanov, Artem
%A Baldwin, Tim
%A Biemann, Chris
%Y Elaraby, Mohamed
%Y Hautli-Janisz, Annette
%Y Romberg, Julia
%Y Musi, Elena
%Y Ruggeri, Federico
%Y Lawrence, John
%S Proceedings of the 13th Workshop on Argument Mining and Reasoning
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-399-9
%F nikishina-etal-2026-argument
%X Despite the ability of large language models (LLMs) to generate coherent comparative answers, automatic comparative question answering (CQA) remains challenging due to the absence of standardized evaluation criteria and the high resource demands of manual assessment. To address these problems, this paper proposes a comprehensive evaluation framework designed to assess the quality of CQA summaries using LLMs-as-a-Judge. We formulate 15 evaluation criteria for assessing comparative answers generated by various sources, including LLMs, human experts, and prior work. To capture a diverse range of comparative answers, LLM summaries were generated under various prompting scenarios. We evaluate the effectiveness of our framework using both human assessment and LLMs, demonstrating the consistency between automated and manual evaluations. Finally, we fine-tune Llama-3-8B-Instruct on a dataset generated from the best-performing CQA models in our evaluation.
%U https://aclanthology.org/2026.argmining-1.6/
%P 43-51
Markdown (Informal)
[Argument-Based Comparative Question Answering Evaluation Benchmark](https://aclanthology.org/2026.argmining-1.6/) (Nikishina et al., ArgMining 2026)
ACL
- Irina Nikishina, Saba Anwar, Nikolay Dolgov, Maria Manina, Daria Ignatenko, Viktor Moskvoretskii, Artem Shelmanov, Tim Baldwin, and Chris Biemann. 2026. Argument-Based Comparative Question Answering Evaluation Benchmark. In Proceedings of the 13th Workshop on Argument Mining and Reasoning, pages 43–51, San Diego, California, USA. Association for Computational Linguistics.