BibTeX
@inproceedings{rocca-yarkoni-2022-language,
title = "Language as a fingerprint: Self-supervised learning of user encodings using transformers",
author = "Rocca, Roberta and
Yarkoni, Tal",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-emnlp.123",
doi = "10.18653/v1/2022.findings-emnlp.123",
pages = "1701--1714",
abstract = "The way we talk carries information about who we are. Demographics, personality, clinical conditions, political preferences influence what we speak about and how, suggesting that many individual attributes could be inferred from adequate encodings of linguistic behavior. Conversely, conditioning text representations on author attributes has been shown to improve model performance in many NLP tasks. Previous research on individual differences and language representations has mainly focused on predicting selected attributes from text, or on conditioning text representations on such attributes for author-based contextualization. Here, we present a self-supervised approach to learning language-based user encodings using transformers. Using a large corpus of Reddit submissions, we fine-tune DistilBERT on user-based triplet loss. We show that fine-tuned models can pick up on complex linguistic signatures of users, and that they are able to infer rich information about them. Through a series of intrinsic analyses and probing tasks, we provide evidence that fine-tuning enhances models{'} ability to abstract generalizable user information, which yields performance advantages for user-based downstream tasks. We discuss applications in language-based assessment and contextualized and personalized NLP.",
}
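The abstract describes the core training signal: DistilBERT is fine-tuned with a user-based triplet loss on Reddit submissions, so that posts by the same user are encoded nearby and posts by different users are pushed apart. As a rough illustration of that objective (a minimal sketch, not the authors' released code: the checkpoint name, mean pooling, margin, and example texts are all assumptions):

```python
# Sketch of user-based triplet-loss fine-tuning on DistilBERT encodings.
# NOT the authors' released code; pooling, margin, and data are illustrative.
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
encoder = AutoModel.from_pretrained("distilbert-base-uncased")

def embed(texts):
    # Mean-pool token states into one vector per text (one common pooling choice).
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    hidden = encoder(**batch).last_hidden_state            # (B, T, H)
    mask = batch["attention_mask"].unsqueeze(-1).float()   # (B, T, 1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)    # (B, H)

# Anchor and positive are posts by the same user; the negative is by another.
anchor = embed(["a submission by user A"])
positive = embed(["a different submission by user A"])
negative = embed(["a submission by user B"])

# Pull same-user encodings together, push different-user encodings apart.
loss = F.triplet_margin_loss(anchor, positive, negative, margin=1.0)
loss.backward()  # an optimizer step over encoder.parameters() would follow
```

After such training, the pooled encoding of a user's posts acts as the language-based user representation that the paper probes and feeds to downstream tasks.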
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="rocca-yarkoni-2022-language">
    <titleInfo>
      <title>Language as a fingerprint: Self-supervised learning of user encodings using transformers</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Roberta</namePart>
      <namePart type="family">Rocca</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tal</namePart>
      <namePart type="family">Yarkoni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2022</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yoav</namePart>
        <namePart type="family">Goldberg</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Zornitsa</namePart>
        <namePart type="family">Kozareva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yue</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, United Arab Emirates</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The way we talk carries information about who we are. Demographics, personality, clinical conditions, political preferences influence what we speak about and how, suggesting that many individual attributes could be inferred from adequate encodings of linguistic behavior. Conversely, conditioning text representations on author attributes has been shown to improve model performance in many NLP tasks. Previous research on individual differences and language representations has mainly focused on predicting selected attributes from text, or on conditioning text representations on such attributes for author-based contextualization. Here, we present a self-supervised approach to learning language-based user encodings using transformers. Using a large corpus of Reddit submissions, we fine-tune DistilBERT on user-based triplet loss. We show that fine-tuned models can pick up on complex linguistic signatures of users, and that they are able to infer rich information about them. Through a series of intrinsic analyses and probing tasks, we provide evidence that fine-tuning enhances models’ ability to abstract generalizable user information, which yields performance advantages for user-based downstream tasks. We discuss applications in language-based assessment and contextualized and personalized NLP.</abstract>
    <identifier type="citekey">rocca-yarkoni-2022-language</identifier>
    <identifier type="doi">10.18653/v1/2022.findings-emnlp.123</identifier>
    <location>
      <url>https://aclanthology.org/2022.findings-emnlp.123</url>
    </location>
    <part>
      <date>2022-12</date>
      <extent unit="page">
        <start>1701</start>
        <end>1714</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Language as a fingerprint: Self-supervised learning of user encodings using transformers
%A Rocca, Roberta
%A Yarkoni, Tal
%Y Goldberg, Yoav
%Y Kozareva, Zornitsa
%Y Zhang, Yue
%S Findings of the Association for Computational Linguistics: EMNLP 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates
%F rocca-yarkoni-2022-language
%X The way we talk carries information about who we are. Demographics, personality, clinical conditions, political preferences influence what we speak about and how, suggesting that many individual attributes could be inferred from adequate encodings of linguistic behavior. Conversely, conditioning text representations on author attributes has been shown to improve model performance in many NLP tasks. Previous research on individual differences and language representations has mainly focused on predicting selected attributes from text, or on conditioning text representations on such attributes for author-based contextualization. Here, we present a self-supervised approach to learning language-based user encodings using transformers. Using a large corpus of Reddit submissions, we fine-tune DistilBERT on user-based triplet loss. We show that fine-tuned models can pick up on complex linguistic signatures of users, and that they are able to infer rich information about them. Through a series of intrinsic analyses and probing tasks, we provide evidence that fine-tuning enhances models’ ability to abstract generalizable user information, which yields performance advantages for user-based downstream tasks. We discuss applications in language-based assessment and contextualized and personalized NLP.
%R 10.18653/v1/2022.findings-emnlp.123
%U https://aclanthology.org/2022.findings-emnlp.123
%U https://doi.org/10.18653/v1/2022.findings-emnlp.123
%P 1701-1714
Markdown (Informal)
[Language as a fingerprint: Self-supervised learning of user encodings using transformers](https://aclanthology.org/2022.findings-emnlp.123) (Rocca & Yarkoni, Findings 2022)
ACL
Roberta Rocca and Tal Yarkoni. 2022. Language as a fingerprint: Self-supervised learning of user encodings using transformers. In Findings of the Association for Computational Linguistics: EMNLP 2022, pages 1701–1714, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.