% BibTeX record for a shared-task system paper in the BioNLP 2026 (Shared Tasks) proceedings.
% NOTE(review): the abstract's model-size bound is written as $\leq$ 9B; the exported text
% read "= 9B", which looks like a lost less-than-or-equal sign -- confirm against the paper.
@inproceedings{karip-hossain-2026-alienannotators,
    title = "{AlienAnnotators} at {PsyDefDetect}: What Lies Between the Lines: Probing Lightweight Open-Source {LLMs} for Psychological Defense Mechanism Detection",
    author = "Karip, Siam and
      Hossain, Nahid",
    editor = "Gupta, Deepak and
      Demner-Fushman, Dina",
    booktitle = "Proceedings of the {BioNLP} 2026 (Shared Tasks)",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.bionlp-2.28/",
    pages = "213--223",
    isbn = "979-8-89176-435-4",
    abstract = "Detecting psychological defense mechanisms in therapy dialogue is a clinically valuable but computationally underexplored task. We present our systematic analysis for PsyDefDetect, a shared task at BioNLP@ACL 2026, which frames defense detection as a nine-class utterance-level classification problem based on the Defense Mechanism Rating Scale (DMRS). We systematically evaluate six open-source, instruction-tuned small language models (SLMs, $\leq$ 9B parameters) in zero-shot and fine-tuning settings, and compare a clinically-grounded prompt against the organizer-provided baseline. Our official submission achieved 59.96{\%} accuracy and 16.28{\%} Macro F1. Post-submission experiments show that fine-tuning combined with 5-fold cross-validation and logit averaging ensemble substantially improves performance, with the best configuration reaching 34.59{\%} Macro F1 and 65.25{\%} accuracy. We find that clinically-grounded prompts outperform bare label definitions, model scale does not consistently improve zero-shot performance, and fine-tuning dramatically recovers even collapsed zero-shot models. Certain defense tiers remain persistently difficult across all settings, pointing to clinical ambiguity at tier boundaries as a more fundamental bottleneck than data imbalance alone."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the paper with citekey karip-hossain-2026-alienannotators.
       NOTE(review): the abstract's model-size bound is restored as "≤ 9B"; the exported
       text read "= 9B", which looks like a lost symbol. Confirm against the paper. -->
  <mods ID="karip-hossain-2026-alienannotators">
    <titleInfo>
      <title>AlienAnnotators at PsyDefDetect: What Lies Between the Lines: Probing Lightweight Open-Source LLMs for Psychological Defense Mechanism Detection</title>
    </titleInfo>
    <!-- Authors, in byline order -->
    <name type="personal">
      <namePart type="given">Siam</namePart>
      <namePart type="family">Karip</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nahid</namePart>
      <namePart type="family">Hossain</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the BioNLP 2026 (Shared Tasks)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Deepak</namePart>
        <namePart type="family">Gupta</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Dina</namePart>
        <namePart type="family">Demner-Fushman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-435-4</identifier>
    </relatedItem>
    <abstract>Detecting psychological defense mechanisms in therapy dialogue is a clinically valuable but computationally underexplored task. We present our systematic analysis for PsyDefDetect, a shared task at BioNLP@ACL 2026, which frames defense detection as a nine-class utterance-level classification problem based on the Defense Mechanism Rating Scale (DMRS). We systematically evaluate six open-source, instruction-tuned small language models (SLMs, ≤ 9B parameters) in zero-shot and fine-tuning settings, and compare a clinically-grounded prompt against the organizer-provided baseline. Our official submission achieved 59.96% accuracy and 16.28% Macro F1. Post-submission experiments show that fine-tuning combined with 5-fold cross-validation and logit averaging ensemble substantially improves performance, with the best configuration reaching 34.59% Macro F1 and 65.25% accuracy. We find that clinically-grounded prompts outperform bare label definitions, model scale does not consistently improve zero-shot performance, and fine-tuning dramatically recovers even collapsed zero-shot models. Certain defense tiers remain persistently difficult across all settings, pointing to clinical ambiguity at tier boundaries as a more fundamental bottleneck than data imbalance alone.</abstract>
    <identifier type="citekey">karip-hossain-2026-alienannotators</identifier>
    <location>
      <url>https://aclanthology.org/2026.bionlp-2.28/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>213</start>
        <end>223</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T AlienAnnotators at PsyDefDetect: What Lies Between the Lines: Probing Lightweight Open-Source LLMs for Psychological Defense Mechanism Detection
%A Karip, Siam
%A Hossain, Nahid
%Y Gupta, Deepak
%Y Demner-Fushman, Dina
%S Proceedings of the BioNLP 2026 (Shared Tasks)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-435-4
%F karip-hossain-2026-alienannotators
%X Detecting psychological defense mechanisms in therapy dialogue is a clinically valuable but computationally underexplored task. We present our systematic analysis for PsyDefDetect, a shared task at BioNLP@ACL 2026, which frames defense detection as a nine-class utterance-level classification problem based on the Defense Mechanism Rating Scale (DMRS). We systematically evaluate six open-source, instruction-tuned small language models (SLMs, ≤ 9B parameters) in zero-shot and fine-tuning settings, and compare a clinically-grounded prompt against the organizer-provided baseline. Our official submission achieved 59.96% accuracy and 16.28% Macro F1. Post-submission experiments show that fine-tuning combined with 5-fold cross-validation and logit averaging ensemble substantially improves performance, with the best configuration reaching 34.59% Macro F1 and 65.25% accuracy. We find that clinically-grounded prompts outperform bare label definitions, model scale does not consistently improve zero-shot performance, and fine-tuning dramatically recovers even collapsed zero-shot models. Certain defense tiers remain persistently difficult across all settings, pointing to clinical ambiguity at tier boundaries as a more fundamental bottleneck than data imbalance alone.
%U https://aclanthology.org/2026.bionlp-2.28/
%P 213-223
Markdown (Informal)
[AlienAnnotators at PsyDefDetect: What Lies Between the Lines: Probing Lightweight Open-Source LLMs for Psychological Defense Mechanism Detection](https://aclanthology.org/2026.bionlp-2.28/) (Karip & Hossain, BioNLP 2026)
ACL