@inproceedings{chooi-2026-stylistic-transfer,
title = "Stylistic Transfer from Annotator Communities to Large Language Models",
author = "Chooi, Jay",
editor = "Alves, Diego and
Bizzoni, Yuri and
Degaetano-Ortlieb, Stefania and
Kazantseva, Anna and
Pagel, Janis and
Szpakowicz, Stan",
booktitle = "Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.latechclfl-1.13/",
pages = "135--145",
isbn = "979-8-89176-373-9",
abstract = "Large language models (LLMs) are post-trained on human feedback collected from annotator communities, yet the linguistic influence of these annotator communities on language models remains poorly understood. We investigated the stylistic transfer from Nigerian annotators to the LLaMA family of models through a natural experiment with LLaMA 2 and LLaMA 3.1, as their release dates are separated by the shutdown of a major data annotation service provider in Nigeria. We generated corpora from both model families and measured linguistic style by computing the difference-in-difference of the Jensen-Shannon distance on the bigram distribution between model outputs and corpora of Nigerian English and US English. We found that, although both pre-trained model variants exhibit similar proximity to both English variants, the LLaMA 2 post-trained model moved toward Nigerian English, while the LLaMA 3.1 post-trained model moved away from Nigerian English. Qualitatively, we found that post-trained LLaMA 2 models used significantly fewer contractions, in line with Nigerian English speakers opting to use a formal register due to its role as an index of knowledgeability. Our findings suggest that annotator communities can imprint linguistic style on large language models, with potential implications such as a disproportionately higher false positive rate in AI plagiarism detection for users who share a linguistic style with annotator communities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chooi-2026-stylistic-transfer">
<titleInfo>
<title>Stylistic Transfer from Annotator Communities to Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jay</namePart>
<namePart type="family">Chooi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Diego</namePart>
<namePart type="family">Alves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuri</namePart>
<namePart type="family">Bizzoni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stefania</namePart>
<namePart type="family">Degaetano-Ortlieb</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Kazantseva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janis</namePart>
<namePart type="family">Pagel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stan</namePart>
<namePart type="family">Szpakowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-373-9</identifier>
</relatedItem>
<abstract>Large language models (LLMs) are post-trained on human feedback collected from annotator communities, yet the linguistic influence of these annotator communities on language models remains poorly understood. We investigated the stylistic transfer from Nigerian annotators to the LLaMA family of models through a natural experiment with LLaMA 2 and LLaMA 3.1, as their release dates are separated by the shutdown of a major data annotation service provider in Nigeria. We generated corpora from both model families and measured linguistic style by computing the difference-in-difference of the Jensen-Shannon distance on the bigram distribution between model outputs and corpora of Nigerian English and US English. We found that, although both pre-trained model variants exhibit similar proximity to both English variants, the LLaMA 2 post-trained model moved toward Nigerian English, while the LLaMA 3.1 post-trained model moved away from Nigerian English. Qualitatively, we found that post-trained LLaMA 2 models used significantly fewer contractions, in line with Nigerian English speakers opting to use a formal register due to its role as an index of knowledgeability. Our findings suggest that annotator communities can imprint linguistic style on large language models, with potential implications such as a disproportionately higher false positive rate in AI plagiarism detection for users who share a linguistic style with annotator communities.</abstract>
<identifier type="citekey">chooi-2026-stylistic-transfer</identifier>
<location>
<url>https://aclanthology.org/2026.latechclfl-1.13/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>135</start>
<end>145</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Stylistic Transfer from Annotator Communities to Large Language Models
%A Chooi, Jay
%Y Alves, Diego
%Y Bizzoni, Yuri
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Pagel, Janis
%Y Szpakowicz, Stan
%S Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-373-9
%F chooi-2026-stylistic-transfer
%X Large language models (LLMs) are post-trained on human feedback collected from annotator communities, yet the linguistic influence of these annotator communities on language models remains poorly understood. We investigated the stylistic transfer from Nigerian annotators to the LLaMA family of models through a natural experiment with LLaMA 2 and LLaMA 3.1, as their release dates are separated by the shutdown of a major data annotation service provider in Nigeria. We generated corpora from both model families and measured linguistic style by computing the difference-in-difference of the Jensen-Shannon distance on the bigram distribution between model outputs and corpora of Nigerian English and US English. We found that, although both pre-trained model variants exhibit similar proximity to both English variants, the LLaMA 2 post-trained model moved toward Nigerian English, while the LLaMA 3.1 post-trained model moved away from Nigerian English. Qualitatively, we found that post-trained LLaMA 2 models used significantly fewer contractions, in line with Nigerian English speakers opting to use a formal register due to its role as an index of knowledgeability. Our findings suggest that annotator communities can imprint linguistic style on large language models, with potential implications such as a disproportionately higher false positive rate in AI plagiarism detection for users who share a linguistic style with annotator communities.
%U https://aclanthology.org/2026.latechclfl-1.13/
%P 135-145
Markdown (Informal)
[Stylistic Transfer from Annotator Communities to Large Language Models](https://aclanthology.org/2026.latechclfl-1.13/) (Chooi, LaTeCH-CLfL 2026)
ACL