% Conference-paper record (its url field points at aclanthology.org).
% The abstract field carries LaTeX markup: a literal dollar sign is written
% as \$ so it is not read as a math delimiter, and every math fragment sits
% in a balanced pair of math delimiters.
@inproceedings{vidigal-2026-textblob,
    title     = {From {TextBlob} to {LLM} Agents: Sentiment Model Selection for {B2B} Technical Support with {CSAT} Ground Truth},
    author    = {Vidigal, Pedro},
    editor    = {Li, Yunyao and Rehm, Georg and Tu, Mei},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-industry.121/},
    pages     = {1774--1782},
    isbn      = {979-8-89176-394-4},
    abstract  = {We present a five-year case study of sentiment model selection for customer satisfaction (CSAT) prediction in B2B technical support. Our evaluation uses the complete population of CSAT-rated tickets from an enterprise software company: over 500 tickets comprising ${\sim}$2,500 customer comments from 100+ organizations over five years. We evaluate 17 approaches across 5 paradigms (lexicon, off-the-shelf transformers, NLI zero-shot, multi-task LLM agent, and 12 dedicated LLM agents from 6 vendor families), plus 11 fine-tuning experiments (all achieving MCC$\leq$0). Key findings: (1) a dedicated single-task LLM agent reduces neutral bias from 69{\%} to 22{\%}, improving MCC from $-$0.018 to 0.347 ($p<0.001$); (2) our results are consistent with the ``Alignment Tax'' (Lin et al., 2024; Wu et al., 2025) in sentiment classification: Claude Opus 4.6 exhibits 41{\%} neutral predictions and lower recall than its budget model Haiku 4.5 ($p=0.003$); (3) ${\sim}$38{\%} of dissatisfied customers are undetectable by all 12 LLMs due to administrative requests lacking emotional language; (4) Gemini 3 Flash achieves the best MCC (0.347) at \$0.60/1K, over 100$\times$ cheaper than Claude Opus. We describe the three-phase production deployment and provide practitioner recommendations.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for one conference paper: the paper's own metadata sits
       directly under <mods>; the containing proceedings volume (title,
       editors, publisher, place, ISBN) is the <relatedItem type="host">.
       In <abstract>, "<" is written as &lt; because a bare "<" is not
       well-formed XML character data. -->
  <mods ID="vidigal-2026-textblob">
    <titleInfo>
      <title>From TextBlob to LLM Agents: Sentiment Model Selection for B2B Technical Support with CSAT Ground Truth</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Pedro</namePart>
      <namePart type="family">Vidigal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mei</namePart>
        <namePart type="family">Tu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-394-4</identifier>
    </relatedItem>
    <abstract>We present a five-year case study of sentiment model selection for customer satisfaction (CSAT) prediction in B2B technical support. Our evaluation uses the complete population of CSAT-rated tickets from an enterprise software company: over 500 tickets comprising ~2,500 customer comments from 100+ organizations over five years. We evaluate 17 approaches across 5 paradigms (lexicon, off-the-shelf transformers, NLI zero-shot, multi-task LLM agent, and 12 dedicated LLM agents from 6 vendor families), plus 11 fine-tuning experiments (all achieving MCC≤0). Key findings: (1) a dedicated single-task LLM agent reduces neutral bias from 69% to 22%, improving MCC from -0.018 to 0.347 (p&lt;0.001); (2) our results are consistent with the “Alignment Tax” (Lin et al., 2024; Wu et al., 2025) in sentiment classification: Claude Opus 4.6 exhibits 41% neutral predictions and lower recall than its budget model Haiku 4.5 (p=0.003); (3) ~38% of dissatisfied customers are undetectable by all 12 LLMs due to administrative requests lacking emotional language; (4) Gemini 3 Flash achieves the best MCC (0.347) at $0.60/1K, over 100× cheaper than Claude Opus. We describe the three-phase production deployment and provide practitioner recommendations.</abstract>
    <identifier type="citekey">vidigal-2026-textblob</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-industry.121/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>1774</start>
        <end>1782</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T From TextBlob to LLM Agents: Sentiment Model Selection for B2B Technical Support with CSAT Ground Truth
%A Vidigal, Pedro
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F vidigal-2026-textblob
%X We present a five-year case study of sentiment model selection for customer satisfaction (CSAT) prediction in B2B technical support. Our evaluation uses the complete population of CSAT-rated tickets from an enterprise software company: over 500 tickets comprising ~2,500 customer comments from 100+ organizations over five years. We evaluate 17 approaches across 5 paradigms (lexicon, off-the-shelf transformers, NLI zero-shot, multi-task LLM agent, and 12 dedicated LLM agents from 6 vendor families), plus 11 fine-tuning experiments (all achieving MCC≤0). Key findings: (1) a dedicated single-task LLM agent reduces neutral bias from 69% to 22%, improving MCC from -0.018 to 0.347 (p<0.001); (2) our results are consistent with the “Alignment Tax” (Lin et al., 2024; Wu et al., 2025) in sentiment classification: Claude Opus 4.6 exhibits 41% neutral predictions and lower recall than its budget model Haiku 4.5 (p=0.003); (3) ~38% of dissatisfied customers are undetectable by all 12 LLMs due to administrative requests lacking emotional language; (4) Gemini 3 Flash achieves the best MCC (0.347) at $0.60/1K, over 100× cheaper than Claude Opus. We describe the three-phase production deployment and provide practitioner recommendations.
%U https://aclanthology.org/2026.acl-industry.121/
%P 1774-1782
Markdown (Informal)
[From TextBlob to LLM Agents: Sentiment Model Selection for B2B Technical Support with CSAT Ground Truth](https://aclanthology.org/2026.acl-industry.121/) (Vidigal, ACL 2026)
ACL