@inproceedings{gjurkovic-etal-2021-pandora,
  title     = {{PANDORA} Talks: Personality and Demographics on {Reddit}},
  author    = {Gjurkovi{\'c}, Matej and
               Karan, Mladen and
               Vukojevi{\'c}, Iva and
               Bo{\v{s}}njak, Mihaela and
               Snajder, Jan},
  editor    = {Ku, Lun-Wei and
               Li, Cheng-Te},
  booktitle = {Proceedings of the Ninth International Workshop on Natural Language Processing for Social Media},
  month     = jun,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.socialnlp-1.12},
  doi       = {10.18653/v1/2021.socialnlp-1.12},
  pages     = {138--152},
  abstract  = {Personality and demographics are important variables in social sciences and computational sociolinguistics. However, datasets with both personality and demographic labels are scarce. To address this, we present PANDORA, the first dataset of Reddit comments of 10k users partially labeled with three personality models and demographics (age, gender, and location), including 1.6k users labeled with the well-established Big 5 personality model. We showcase the usefulness of this dataset on three experiments, where we leverage the more readily available data from other personality models to predict the Big 5 traits, analyze gender classification biases arising from psycho-demographic variables, and carry out a confirmatory and exploratory analysis based on psychological theories. Finally, we present benchmark prediction models for all personality and demographic variables.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record for the paper; the containing proceedings are described under relatedItem type="host". -->
  <mods ID="gjurkovic-etal-2021-pandora">
    <titleInfo>
      <title>PANDORA Talks: Personality and Demographics on Reddit</title>
    </titleInfo>
    <!-- Paper authors, in citation order. -->
    <name type="personal">
      <namePart type="given">Matej</namePart>
      <namePart type="family">Gjurković</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mladen</namePart>
      <namePart type="family">Karan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Iva</namePart>
      <namePart type="family">Vukojević</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mihaela</namePart>
      <namePart type="family">Bošnjak</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jan</namePart>
      <namePart type="family">Snajder</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Ninth International Workshop on Natural Language Processing for Social Media</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Lun-Wei</namePart>
        <namePart type="family">Ku</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Cheng-Te</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Personality and demographics are important variables in social sciences and computational sociolinguistics. However, datasets with both personality and demographic labels are scarce. To address this, we present PANDORA, the first dataset of Reddit comments of 10k users partially labeled with three personality models and demographics (age, gender, and location), including 1.6k users labeled with the well-established Big 5 personality model. We showcase the usefulness of this dataset on three experiments, where we leverage the more readily available data from other personality models to predict the Big 5 traits, analyze gender classification biases arising from psycho-demographic variables, and carry out a confirmatory and exploratory analysis based on psychological theories. Finally, we present benchmark prediction models for all personality and demographic variables.</abstract>
    <identifier type="citekey">gjurkovic-etal-2021-pandora</identifier>
    <identifier type="doi">10.18653/v1/2021.socialnlp-1.12</identifier>
    <location>
      <url>https://aclanthology.org/2021.socialnlp-1.12</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2021-06</date>
      <extent unit="page">
        <start>138</start>
        <end>152</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T PANDORA Talks: Personality and Demographics on Reddit
%A Gjurković, Matej
%A Karan, Mladen
%A Vukojević, Iva
%A Bošnjak, Mihaela
%A Snajder, Jan
%Y Ku, Lun-Wei
%Y Li, Cheng-Te
%S Proceedings of the Ninth International Workshop on Natural Language Processing for Social Media
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F gjurkovic-etal-2021-pandora
%X Personality and demographics are important variables in social sciences and computational sociolinguistics. However, datasets with both personality and demographic labels are scarce. To address this, we present PANDORA, the first dataset of Reddit comments of 10k users partially labeled with three personality models and demographics (age, gender, and location), including 1.6k users labeled with the well-established Big 5 personality model. We showcase the usefulness of this dataset on three experiments, where we leverage the more readily available data from other personality models to predict the Big 5 traits, analyze gender classification biases arising from psycho-demographic variables, and carry out a confirmatory and exploratory analysis based on psychological theories. Finally, we present benchmark prediction models for all personality and demographic variables.
%R 10.18653/v1/2021.socialnlp-1.12
%U https://aclanthology.org/2021.socialnlp-1.12
%U https://doi.org/10.18653/v1/2021.socialnlp-1.12
%P 138-152
Markdown (Informal)
[PANDORA Talks: Personality and Demographics on Reddit](https://aclanthology.org/2021.socialnlp-1.12) (Gjurković et al., SocialNLP 2021)
ACL
- Matej Gjurković, Mladen Karan, Iva Vukojević, Mihaela Bošnjak, and Jan Snajder. 2021. PANDORA Talks: Personality and Demographics on Reddit. In Proceedings of the Ninth International Workshop on Natural Language Processing for Social Media, pages 138–152, Online. Association for Computational Linguistics.