@inproceedings{venkatakrishnan-etal-2026-speaks,
title = "Who Speaks for Whom? {LLM}-Generated Survey Data as a Proxy for Public Opinion",
author = "Venkatakrishnan, Radhakrishnan and
Brodbeck, Travis and
Young, Michael D.",
editor = "Card, Dallas and
Field, Anjalie and
Keith, Katherine and
Mendelsohn, Julia",
booktitle = "Proceedings of the Seventh Workshop on Natural Language Processing and Computational Social Science",
month = jul,
year = "2026",
address = "San Diego",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.nlpcss-1.9/",
pages = "133--148",
ISBN = "979-8-89176-426-2",
abstract = "Technological advancements, such as Large Language Models (LLMs), offer a potential solution to the two-faceted problem facing social science researchers: rising costs and declining response rates. The use of artificial personas is a budding practice, where chatbots are given the demographic characteristics of the person they are supposed to role-play as and answer questions for researchers. Before scholars and practitioners augment or replace the data created by interviewing humans, it is essential to understand how well models perform in generating accurate, reliable, and robust data, with concerns that the training of LLMs results in a bias towards the norms of WEIRD cultures. We present a procedure for practitioners to use to evaluate the quality of their synthetic data by measuring Intra Class Correlation (ICC), Earth Mover Distance (EMD), Variance, Hedging, and demographic drivers of LLM output. We find that the models may generate plausible results in the aggregate, but these synthetic data do not exhibit the depth or nuance of human respondents. Secondarily, we find that despite having generated definitive answers on a ten-point scale, the reasoning provided by the LLM exhibited varying degrees of hedging that do not consistently align with the LLM{'}s answer. The distortion of the results was not uniformly distributed; instead, the effects were more extreme for some demographic groups. Our findings suggest that the technology generating synthetic survey data may not be mature enough to address the increasing challenges of interviewing humans for public opinion research."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="venkatakrishnan-etal-2026-speaks">
<titleInfo>
<title>Who Speaks for Whom? LLM-Generated Survey Data as a Proxy for Public Opinion</title>
</titleInfo>
<name type="personal">
<namePart type="given">Radhakrishnan</namePart>
<namePart type="family">Venkatakrishnan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Travis</namePart>
<namePart type="family">Brodbeck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="given">D.</namePart>
<namePart type="family">Young</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Seventh Workshop on Natural Language Processing and Computational Social Science</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dallas</namePart>
<namePart type="family">Card</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anjalie</namePart>
<namePart type="family">Field</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Katherine</namePart>
<namePart type="family">Keith</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Mendelsohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-426-2</identifier>
</relatedItem>
<abstract>Technological advancements, such as Large Language Models (LLMs), offer a potential solution to the two-faceted problem facing social science researchers: rising costs and declining response rates. The use of artificial personas is a budding practice, where chatbots are given the demographic characteristics of the person they are supposed to role-play as and answer questions for researchers. Before scholars and practitioners augment or replace the data created by interviewing humans, it is essential to understand how well models perform in generating accurate, reliable, and robust data, with concerns that the training of LLMs results in a bias towards the norms of WEIRD cultures. We present a procedure for practitioners to use to evaluate the quality of their synthetic data by measuring Intra Class Correlation (ICC), Earth Mover Distance (EMD), Variance, Hedging, and demographic drivers of LLM output. We find that the models may generate plausible results in the aggregate, but these synthetic data do not exhibit the depth or nuance of human respondents. Secondarily, we find that despite having generated definitive answers on a ten-point scale, the reasoning provided by the LLM exhibited varying degrees of hedging that do not consistently align with the LLM’s answer. The distortion of the results was not uniformly distributed; instead, the effects were more extreme for some demographic groups. Our findings suggest that the technology generating synthetic survey data may not be mature enough to address the increasing challenges of interviewing humans for public opinion research.</abstract>
<identifier type="citekey">venkatakrishnan-etal-2026-speaks</identifier>
<location>
<url>https://aclanthology.org/2026.nlpcss-1.9/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>133</start>
<end>148</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Who Speaks for Whom? LLM-Generated Survey Data as a Proxy for Public Opinion
%A Venkatakrishnan, Radhakrishnan
%A Brodbeck, Travis
%A Young, Michael D.
%Y Card, Dallas
%Y Field, Anjalie
%Y Keith, Katherine
%Y Mendelsohn, Julia
%S Proceedings of the Seventh Workshop on Natural Language Processing and Computational Social Science
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego
%@ 979-8-89176-426-2
%F venkatakrishnan-etal-2026-speaks
%X Technological advancements, such as Large Language Models (LLMs), offer a potential solution to the two-faceted problem facing social science researchers: rising costs and declining response rates. The use of artificial personas is a budding practice, where chatbots are given the demographic characteristics of the person they are supposed to role-play as and answer questions for researchers. Before scholars and practitioners augment or replace the data created by interviewing humans, it is essential to understand how well models perform in generating accurate, reliable, and robust data, with concerns that the training of LLMs results in a bias towards the norms of WEIRD cultures. We present a procedure for practitioners to use to evaluate the quality of their synthetic data by measuring Intra Class Correlation (ICC), Earth Mover Distance (EMD), Variance, Hedging, and demographic drivers of LLM output. We find that the models may generate plausible results in the aggregate, but these synthetic data do not exhibit the depth or nuance of human respondents. Secondarily, we find that despite having generated definitive answers on a ten-point scale, the reasoning provided by the LLM exhibited varying degrees of hedging that do not consistently align with the LLM’s answer. The distortion of the results was not uniformly distributed; instead, the effects were more extreme for some demographic groups. Our findings suggest that the technology generating synthetic survey data may not be mature enough to address the increasing challenges of interviewing humans for public opinion research.
%U https://aclanthology.org/2026.nlpcss-1.9/
%P 133-148
Markdown (Informal)
[Who Speaks for Whom? LLM-Generated Survey Data as a Proxy for Public Opinion](https://aclanthology.org/2026.nlpcss-1.9/) (Venkatakrishnan et al., NLP+CSS 2026)
ACL