@inproceedings{nascimento-etal-2026-social,
title = "Social-{RAG}: A Retrieval-Augmented Generation Pipeline for Computational Social Science Research on Telegram",
author = "Nascimento, Leonardo and
Brasil, Eric and
Lima, Arthur and
Andrade, Gabriel and
Andrade, Ricardo Jos{\'e} and
Barreto, Tarssio",
editor = "Souza, Marlo and
de-Dios-Flores, Iria and
Santos, Diana and
Freitas, Larissa and
Souza, Jackson Wilke da Cruz and
Ribeiro, Eug{\'e}nio",
booktitle = "Proceedings of the 17th International Conference on Computational Processing of {P}ortuguese ({PROPOR} 2026) - Vol. 2",
month = apr,
year = "2026",
address = "Salvador, Brazil",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.propor-2.34/",
pages = "255--265",
ISBN = "979-8-89176-387-6",
abstract = "Digital trace data have expanded empirical opportunities in the social sciences while intensifying the methodological challenge of scale: researchers increasingly face corpora too large and fast-moving to read exhaustively without sacrificing interpretive rigor. This article presents Social-RAG, a modular Retrieval-Augmented Generation (RAG) architecture designed to support scalable qualitative inquiry over large text corpora while preserving evidence traceability, auditability, and researcher control. Our empirical basis consists of messages from public Telegram groups and channels, organized into two thematic subsets: vaccine-related discourse and debates surrounding Brazil{'}s Lei Rouanet cultural funding policy. We detail key design decisions, including a ``one post = one chunk'' indexing strategy, semantic retrieval over vector embeddings with efficient ANN search, an Adaptive-K dynamic cutoff for context selection, MMR re-ranking for diversity, and structured analytical instructions that constrain generation to retrieved evidence. We evaluate system behavior using two complementary question blocks, hermeneutic (narrative) and factual, and compare outputs across three language models with distinct deployment profiles (a local open-weight model, a cloud open-weight model, and a commercial closed model), using an LLM-as-judge protocol with explicit qualitative criteria. Results show consistent behaviour across both thematic corpora and highlight a key trade-off: the two larger/closed models perform similarly and robustly in both narrative and factual tasks when evidential discipline is maintained, whereas the smaller local model remains useful for exploratory narrative synthesis but is less reliable for strict factual extraction and attribution. We conclude by discussing methodological implications, limitations, and future directions, with a focus on scalability and extensibility to new data types and analytical problems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nascimento-etal-2026-social">
<titleInfo>
<title>Social-RAG: A Retrieval-Augmented Generation Pipeline for Computational Social Science Research on Telegram</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Nascimento</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Brasil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arthur</namePart>
<namePart type="family">Lima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Andrade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="given">José</namePart>
<namePart type="family">Andrade</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tarssio</namePart>
<namePart type="family">Barreto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Digital trace data have expanded empirical opportunities in the social sciences while intensifying the methodological challenge of scale: researchers increasingly face corpora too large and fast-moving to read exhaustively without sacrificing interpretive rigor. This article presents Social-RAG, a modular Retrieval-Augmented Generation (RAG) architecture designed to support scalable qualitative inquiry over large text corpora while preserving evidence traceability, auditability, and researcher control. Our empirical basis consists of messages from public Telegram groups and channels, organized into two thematic subsets: vaccine-related discourse and debates surrounding Brazil’s Lei Rouanet cultural funding policy. We detail key design decisions, including a “one post = one chunk” indexing strategy, semantic retrieval over vector embeddings with efficient ANN search, an Adaptive-K dynamic cutoff for context selection, MMR re-ranking for diversity, and structured analytical instructions that constrain generation to retrieved evidence. We evaluate system behavior using two complementary question blocks, hermeneutic (narrative) and factual, and compare outputs across three language models with distinct deployment profiles (a local open-weight model, a cloud open-weight model, and a commercial closed model), using an LLM-as-judge protocol with explicit qualitative criteria. Results show consistent behaviour across both thematic corpora and highlight a key trade-off: the two larger/closed models perform similarly and robustly in both narrative and factual tasks when evidential discipline is maintained, whereas the smaller local model remains useful for exploratory narrative synthesis but is less reliable for strict factual extraction and attribution. We conclude by discussing methodological implications, limitations, and future directions, with a focus on scalability and extensibility to new data types and analytical problems.</abstract>
<identifier type="citekey">nascimento-etal-2026-social</identifier>
<location>
<url>https://aclanthology.org/2026.propor-2.34/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>255</start>
<end>265</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Social-RAG: A Retrieval-Augmented Generation Pipeline for Computational Social Science Research on Telegram
%A Nascimento, Leonardo
%A Brasil, Eric
%A Lima, Arthur
%A Andrade, Gabriel
%A Andrade, Ricardo José
%A Barreto, Tarssio
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F nascimento-etal-2026-social
%X Digital trace data have expanded empirical opportunities in the social sciences while intensifying the methodological challenge of scale: researchers increasingly face corpora too large and fast-moving to read exhaustively without sacrificing interpretive rigor. This article presents Social-RAG, a modular Retrieval-Augmented Generation (RAG) architecture designed to support scalable qualitative inquiry over large text corpora while preserving evidence traceability, auditability, and researcher control. Our empirical basis consists of messages from public Telegram groups and channels, organized into two thematic subsets: vaccine-related discourse and debates surrounding Brazil’s Lei Rouanet cultural funding policy. We detail key design decisions, including a “one post = one chunk” indexing strategy, semantic retrieval over vector embeddings with efficient ANN search, an Adaptive-K dynamic cutoff for context selection, MMR re-ranking for diversity, and structured analytical instructions that constrain generation to retrieved evidence. We evaluate system behavior using two complementary question blocks, hermeneutic (narrative) and factual, and compare outputs across three language models with distinct deployment profiles (a local open-weight model, a cloud open-weight model, and a commercial closed model), using an LLM-as-judge protocol with explicit qualitative criteria. Results show consistent behaviour across both thematic corpora and highlight a key trade-off: the two larger/closed models perform similarly and robustly in both narrative and factual tasks when evidential discipline is maintained, whereas the smaller local model remains useful for exploratory narrative synthesis but is less reliable for strict factual extraction and attribution. We conclude by discussing methodological implications, limitations, and future directions, with a focus on scalability and extensibility to new data types and analytical problems.
%U https://aclanthology.org/2026.propor-2.34/
%P 255-265
Markdown (Informal)
[Social-RAG: A Retrieval-Augmented Generation Pipeline for Computational Social Science Research on Telegram](https://aclanthology.org/2026.propor-2.34/) (Nascimento et al., PROPOR 2026)
ACL