@inproceedings{hengle-etal-2025-cseval,
title = "{CSE}val: Towards Automated, Multi-Dimensional, and Reference-Free Counterspeech Evaluation using Auto-Calibrated {LLM}s",
author = "Hengle, Amey and
Padhi, Aswini Kumar and
Bandhakavi, Anil and
Chakraborty, Tanmoy",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.naacl-long.279/",
doi = "10.18653/v1/2025.naacl-long.279",
pages = "5402--5419",
ISBN = "979-8-89176-189-6",
abstract = "Counterspeech has emerged as a popular and effective strategy for combating online hate speech, sparking growing research interest in automating its generation using language models. However, the field still lacks standardised evaluation protocols and reliable automated evaluation metrics that align with human judgement. Current automatic evaluation methods, primarily based on similarity metrics, do not effectively capture the complex and independent attributes of counterspeech quality, such as contextual relevance, aggressiveness, or argumentative coherence. This has led to an increased dependency on labor-intensive human evaluations to assess automated counter-speech generation methods. To address these challenges, we introduce `CSEval{`}, a novel dataset and framework for evaluating counterspeech quality across four dimensions: *contextual-relevance*, *aggressiveness*, *argument-coherence*, and *suitableness*. Furthermore, we propose *Auto-Calibrated COT for Counterspeech Evaluation* ({`}Auto-CSEval{`}), a prompt-based method with auto-calibrated chain-of-thoughts (CoT) for scoring counterspeech using large language models. Our experiments show that `Auto-CSEval{`} outperforms traditional metrics like ROUGE, METEOR, and BertScore in correlating with human judgement, indicating a significant improvement in automated counterspeech evaluation."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hengle-etal-2025-cseval">
<titleInfo>
<title>CSEval: Towards Automated, Multi-Dimensional, and Reference-Free Counterspeech Evaluation using Auto-Calibrated LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Amey</namePart>
<namePart type="family">Hengle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aswini</namePart>
<namePart type="given">Kumar</namePart>
<namePart type="family">Padhi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anil</namePart>
<namePart type="family">Bandhakavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luis</namePart>
<namePart type="family">Chiruzzo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alan</namePart>
<namePart type="family">Ritter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lu</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Albuquerque, New Mexico</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-189-6</identifier>
</relatedItem>
<abstract>Counterspeech has emerged as a popular and effective strategy for combating online hate speech, sparking growing research interest in automating its generation using language models. However, the field still lacks standardised evaluation protocols and reliable automated evaluation metrics that align with human judgement. Current automatic evaluation methods, primarily based on similarity metrics, do not effectively capture the complex and independent attributes of counterspeech quality, such as contextual relevance, aggressiveness, or argumentative coherence. This has led to an increased dependency on labor-intensive human evaluations to assess automated counter-speech generation methods. To address these challenges, we introduce CSEval, a novel dataset and framework for evaluating counterspeech quality across four dimensions: contextual-relevance, aggressiveness, argument-coherence, and suitableness. Furthermore, we propose Auto-Calibrated COT for Counterspeech Evaluation (Auto-CSEval), a prompt-based method with auto-calibrated chain-of-thoughts (CoT) for scoring counterspeech using large language models. Our experiments show that Auto-CSEval outperforms traditional metrics like ROUGE, METEOR, and BertScore in correlating with human judgement, indicating a significant improvement in automated counterspeech evaluation.</abstract>
<identifier type="citekey">hengle-etal-2025-cseval</identifier>
<identifier type="doi">10.18653/v1/2025.naacl-long.279</identifier>
<location>
<url>https://aclanthology.org/2025.naacl-long.279/</url>
</location>
<part>
<date>2025-04</date>
<extent unit="page">
<start>5402</start>
<end>5419</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T CSEval: Towards Automated, Multi-Dimensional, and Reference-Free Counterspeech Evaluation using Auto-Calibrated LLMs
%A Hengle, Amey
%A Padhi, Aswini Kumar
%A Bandhakavi, Anil
%A Chakraborty, Tanmoy
%Y Chiruzzo, Luis
%Y Ritter, Alan
%Y Wang, Lu
%S Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)
%D 2025
%8 April
%I Association for Computational Linguistics
%C Albuquerque, New Mexico
%@ 979-8-89176-189-6
%F hengle-etal-2025-cseval
%X Counterspeech has emerged as a popular and effective strategy for combating online hate speech, sparking growing research interest in automating its generation using language models. However, the field still lacks standardised evaluation protocols and reliable automated evaluation metrics that align with human judgement. Current automatic evaluation methods, primarily based on similarity metrics, do not effectively capture the complex and independent attributes of counterspeech quality, such as contextual relevance, aggressiveness, or argumentative coherence. This has led to an increased dependency on labor-intensive human evaluations to assess automated counter-speech generation methods. To address these challenges, we introduce CSEval, a novel dataset and framework for evaluating counterspeech quality across four dimensions: contextual-relevance, aggressiveness, argument-coherence, and suitableness. Furthermore, we propose Auto-Calibrated COT for Counterspeech Evaluation (Auto-CSEval), a prompt-based method with auto-calibrated chain-of-thoughts (CoT) for scoring counterspeech using large language models. Our experiments show that Auto-CSEval outperforms traditional metrics like ROUGE, METEOR, and BertScore in correlating with human judgement, indicating a significant improvement in automated counterspeech evaluation.
%R 10.18653/v1/2025.naacl-long.279
%U https://aclanthology.org/2025.naacl-long.279/
%U https://doi.org/10.18653/v1/2025.naacl-long.279
%P 5402-5419
Markdown (Informal)
[CSEval: Towards Automated, Multi-Dimensional, and Reference-Free Counterspeech Evaluation using Auto-Calibrated LLMs](https://aclanthology.org/2025.naacl-long.279/) (Hengle et al., NAACL 2025)
ACL
Amey Hengle, Aswini Kumar Padhi, Anil Bandhakavi, and Tanmoy Chakraborty. 2025. [CSEval: Towards Automated, Multi-Dimensional, and Reference-Free Counterspeech Evaluation using Auto-Calibrated LLMs](https://aclanthology.org/2025.naacl-long.279/). In *Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)*, pages 5402–5419, Albuquerque, New Mexico. Association for Computational Linguistics.
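The records above describe Auto-CSEval only at a high level: a prompt-based, reference-free method that scores counterspeech along four dimensions with an LLM. As a rough illustration of that idea, here is a minimal Python sketch of dimension-wise LLM scoring. The prompt template, 1–5 scale, `score_counterspeech` helper, and score parsing are all illustrative assumptions; the paper's auto-calibrated chain-of-thought procedure is not reproduced here.

```python
from typing import Callable

# The four CSEval quality dimensions named in the abstract.
DIMENSIONS = ["contextual-relevance", "aggressiveness",
              "argument-coherence", "suitableness"]

# Hypothetical prompt template; the paper's actual (auto-calibrated CoT)
# prompts are not reproduced here.
PROMPT = """You are evaluating a counterspeech reply to a hate-speech post.
Dimension: {dimension}
Hate speech: {hate_speech}
Counterspeech: {counterspeech}
Think step by step, then end with a line 'Score: <1-5>'."""


def score_counterspeech(hate_speech: str, counterspeech: str,
                        generate: Callable[[str], str]) -> dict[str, int]:
    """Query an LLM once per dimension and parse the final 1-5 score.

    `generate` is any text-in/text-out LLM call, injected so the sketch
    stays independent of a specific provider's API.
    """
    scores = {}
    for dim in DIMENSIONS:
        reply = generate(PROMPT.format(dimension=dim,
                                       hate_speech=hate_speech,
                                       counterspeech=counterspeech))
        # Keep only the last 'Score:' line, ignoring the CoT reasoning.
        line = [l for l in reply.splitlines()
                if l.strip().startswith("Score:")][-1]
        scores[dim] = int(line.split(":")[1].strip())
    return scores


if __name__ == "__main__":
    # Stand-in for a real LLM call, so the sketch runs as-is.
    fake_llm = lambda prompt: "Reasoning...\nScore: 4"
    print(score_counterspeech("<hate speech>", "<counterspeech>", fake_llm))
```

Scoring each dimension with a separate call mirrors the abstract's point that the quality attributes are independent; whether the paper batches dimensions or calibrates the CoT exemplars differently is not determinable from the metadata above.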