@inproceedings{sharma-etal-2026-council,
title = "Council of {LLM}s: Evaluating Capability of Large Language Models to Annotate Propaganda",
author = "Sharma, Vivek and
Jain, Shweta and
Shokri, Mohammad and
Levitan, Sarah Ita and
Filatova, Elena",
editor = "Barnes, Jeremy and
Barriere, Valentin and
De Clercq, Orph{\'e}e and
Klinger, Roman and
Nouri, C{\'e}lia and
Nozza, Debora and
Singh, Pranaydeep",
booktitle = "The Proceedings for the 15th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis ({WASSA} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.wassa-1.1/",
pages = "1--12",
ISBN = "979-8-89176-378-4",
abstract = "Data annotation is essential for supervised natural language processing tasks but remains labor-intensive and expensive. Large language models (LLMs) have emerged as promising alternatives, capable of generating high-quality annotations either autonomously or in collaboration with human annotators. However, their use in autonomous annotation is often questioned for their ethical take on subjective matters. This study investigates the effectiveness of LLMs in autonomous and hybrid annotation setups for propaganda detection. We evaluate GPT and open-source models on two datasets from different domains, namely, the Propaganda Techniques Corpus (PTC) for news articles and the Journalist Media Bias on X (JMBX) dataset for social media. Our results show that LLMs, in general, exhibit high recall but lower precision in detecting propaganda, often over-predicting persuasive content. Multi-annotator setups did not outperform the best models in the single-annotator setting, although they helped reasoning models boost their performance. Hybrid annotation, combining LLMs and human input, achieved higher overall accuracy than LLM-only settings. We further analyze misclassifications and find that LLMs have higher sensitivity towards certain propaganda techniques such as loaded language, name calling, and doubt. Finally, using error typology analysis, we explore the reasoning provided by the LLM on misclassifications. Our results show that although some studies report LLMs outperforming manual annotation, and LLMs could prove useful in hybrid annotation, their incorporation into the human annotation pipeline must be implemented with caution."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sharma-etal-2026-council">
<titleInfo>
<title>Council of LLMs: Evaluating Capability of Large Language Models to Annotate Propaganda</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shweta</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="family">Shokri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sarah</namePart>
<namePart type="given">Ita</namePart>
<namePart type="family">Levitan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Filatova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The Proceedings for the 15th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis (WASSA 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jeremy</namePart>
<namePart type="family">Barnes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Barriere</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Orphée</namePart>
<namePart type="family">De Clercq</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Klinger</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Célia</namePart>
<namePart type="family">Nouri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debora</namePart>
<namePart type="family">Nozza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pranaydeep</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-378-4</identifier>
</relatedItem>
<abstract>Data annotation is essential for supervised natural language processing tasks but remains labor-intensive and expensive. Large language models (LLMs) have emerged as promising alternatives, capable of generating high-quality annotations either autonomously or in collaboration with human annotators. However, their use in autonomous annotation is often questioned for their ethical take on subjective matters. This study investigates the effectiveness of LLMs in autonomous and hybrid annotation setups for propaganda detection. We evaluate GPT and open-source models on two datasets from different domains, namely, the Propaganda Techniques Corpus (PTC) for news articles and the Journalist Media Bias on X (JMBX) dataset for social media. Our results show that LLMs, in general, exhibit high recall but lower precision in detecting propaganda, often over-predicting persuasive content. Multi-annotator setups did not outperform the best models in the single-annotator setting, although they helped reasoning models boost their performance. Hybrid annotation, combining LLMs and human input, achieved higher overall accuracy than LLM-only settings. We further analyze misclassifications and find that LLMs have higher sensitivity towards certain propaganda techniques such as loaded language, name calling, and doubt. Finally, using error typology analysis, we explore the reasoning provided by the LLM on misclassifications. Our results show that although some studies report LLMs outperforming manual annotation, and LLMs could prove useful in hybrid annotation, their incorporation into the human annotation pipeline must be implemented with caution.</abstract>
<identifier type="citekey">sharma-etal-2026-council</identifier>
<location>
<url>https://aclanthology.org/2026.wassa-1.1/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>1</start>
<end>12</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Council of LLMs: Evaluating Capability of Large Language Models to Annotate Propaganda
%A Sharma, Vivek
%A Jain, Shweta
%A Shokri, Mohammad
%A Levitan, Sarah Ita
%A Filatova, Elena
%Y Barnes, Jeremy
%Y Barriere, Valentin
%Y De Clercq, Orphée
%Y Klinger, Roman
%Y Nouri, Célia
%Y Nozza, Debora
%Y Singh, Pranaydeep
%S The Proceedings for the 15th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis (WASSA 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-378-4
%F sharma-etal-2026-council
%X Data annotation is essential for supervised natural language processing tasks but remains labor-intensive and expensive. Large language models (LLMs) have emerged as promising alternatives, capable of generating high-quality annotations either autonomously or in collaboration with human annotators. However, their use in autonomous annotation is often questioned for their ethical take on subjective matters. This study investigates the effectiveness of LLMs in autonomous and hybrid annotation setups for propaganda detection. We evaluate GPT and open-source models on two datasets from different domains, namely, the Propaganda Techniques Corpus (PTC) for news articles and the Journalist Media Bias on X (JMBX) dataset for social media. Our results show that LLMs, in general, exhibit high recall but lower precision in detecting propaganda, often over-predicting persuasive content. Multi-annotator setups did not outperform the best models in the single-annotator setting, although they helped reasoning models boost their performance. Hybrid annotation, combining LLMs and human input, achieved higher overall accuracy than LLM-only settings. We further analyze misclassifications and find that LLMs have higher sensitivity towards certain propaganda techniques such as loaded language, name calling, and doubt. Finally, using error typology analysis, we explore the reasoning provided by the LLM on misclassifications. Our results show that although some studies report LLMs outperforming manual annotation, and LLMs could prove useful in hybrid annotation, their incorporation into the human annotation pipeline must be implemented with caution.
%U https://aclanthology.org/2026.wassa-1.1/
%P 1-12
Markdown (Informal)
[Council of LLMs: Evaluating Capability of Large Language Models to Annotate Propaganda](https://aclanthology.org/2026.wassa-1.1/) (Sharma et al., WASSA 2026)
ACL