% BibTeX record for the paper archived at https://aclanthology.org/2026.conll-main.9/.
% NOTE(review): the address field appears to hold the conference venue rather
% than the publisher's city; left exactly as exported -- confirm before changing.
@inproceedings{soni-etal-2026-addressing,
  title     = {Addressing the Ecological Fallacy in Larger {LMs} with Human Context},
  author    = {Soni, Nikita and
               Kunjadiya, Dhruv Vijay and
               Shah, Pratham Piyush and
               Mohanty, Dikshya and
               Schwartz, H. Andrew and
               Balasubramanian, Niranjan},
  editor    = {Bonial, Claire and
               Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.conll-main.9/},
  pages     = {121--144},
  isbn      = {979-8-89176-410-1},
  abstract  = {Language model training and inference ignore a fundamental linguistic fact: there is a dependence between multiple sequences of text written by the same person. Prior work has shown that addressing this form of \textit{ecological fallacy} can greatly improve the performance of multiple smaller ({\textasciitilde}124M) GPT-based models. In this work, we ask if addressing the ecological fallacy by modeling the author{'}s language context with a specific LM task (called HuLM) can provide similar benefits for a larger-scale model, an 8B Llama model. To this end, we explore variants that process an author{'}s language in the context of their other temporally ordered texts. We study the effect of pre-training with this author context using the HuLM objective, as well as using it during fine-tuning with author context (\textit{HuFT:Human-aware Fine-Tuning}). Empirical comparisons show that addressing the ecological fallacy during fine-tuning alone using QLoRA improves the performance of the larger 8B model over standard fine-tuning. Additionally, QLoRA-based continued HuLM pre-training results in a human-aware model generalizable for improved performance over eight downstream tasks with linear task classifier training alone. These results indicate the utility and importance of modeling language in the context of its original generators, the authors.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey soni-etal-2026-addressing (one mods entry in the collection). -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="soni-etal-2026-addressing">
    <titleInfo>
      <title>Addressing the Ecological Fallacy in Larger LMs with Human Context</title>
    </titleInfo>
    <!-- Authors, in publication order; multi-part given names use one namePart per token. -->
    <name type="personal">
      <namePart type="given">Nikita</namePart>
      <namePart type="family">Soni</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dhruv</namePart>
      <namePart type="given">Vijay</namePart>
      <namePart type="family">Kunjadiya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pratham</namePart>
      <namePart type="given">Piyush</namePart>
      <namePart type="family">Shah</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dikshya</namePart>
      <namePart type="family">Mohanty</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <!-- Initial keeps its period so the name matches the other citation formats in this file. -->
      <namePart type="given">H.</namePart>
      <namePart type="given">Andrew</namePart>
      <namePart type="family">Schwartz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Niranjan</namePart>
      <namePart type="family">Balasubramanian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 30th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Claire</namePart>
        <namePart type="family">Bonial</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yevgeni</namePart>
        <namePart type="family">Berzak</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-410-1</identifier>
    </relatedItem>
    <abstract>Language model training and inference ignore a fundamental linguistic fact: there is a dependence between multiple sequences of text written by the same person. Prior work has shown that addressing this form of ecological fallacy can greatly improve the performance of multiple smaller (~124M) GPT-based models. In this work, we ask if addressing the ecological fallacy by modeling the author’s language context with a specific LM task (called HuLM) can provide similar benefits for a larger-scale model, an 8B Llama model. To this end, we explore variants that process an author’s language in the context of their other temporally ordered texts. We study the effect of pre-training with this author context using the HuLM objective, as well as using it during fine-tuning with author context (HuFT:Human-aware Fine-Tuning). Empirical comparisons show that addressing the ecological fallacy during fine-tuning alone using QLoRA improves the performance of the larger 8B model over standard fine-tuning. Additionally, QLoRA-based continued HuLM pre-training results in a human-aware model generalizable for improved performance over eight downstream tasks with linear task classifier training alone. These results indicate the utility and importance of modeling language in the context of its original generators, the authors.</abstract>
    <identifier type="citekey">soni-etal-2026-addressing</identifier>
    <location>
      <url>https://aclanthology.org/2026.conll-main.9/</url>
    </location>
    <!-- Position of the paper inside the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>121</start>
        <end>144</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Addressing the Ecological Fallacy in Larger LMs with Human Context
%A Soni, Nikita
%A Kunjadiya, Dhruv Vijay
%A Shah, Pratham Piyush
%A Mohanty, Dikshya
%A Schwartz, H. Andrew
%A Balasubramanian, Niranjan
%Y Bonial, Claire
%Y Berzak, Yevgeni
%S Proceedings of the 30th Conference on Computational Natural Language Learning
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-410-1
%F soni-etal-2026-addressing
%X Language model training and inference ignore a fundamental linguistic fact: there is a dependence between multiple sequences of text written by the same person. Prior work has shown that addressing this form of ecological fallacy can greatly improve the performance of multiple smaller (~124M) GPT-based models. In this work, we ask if addressing the ecological fallacy by modeling the author’s language context with a specific LM task (called HuLM) can provide similar benefits for a larger-scale model, an 8B Llama model. To this end, we explore variants that process an author’s language in the context of their other temporally ordered texts. We study the effect of pre-training with this author context using the HuLM objective, as well as using it during fine-tuning with author context (HuFT:Human-aware Fine-Tuning). Empirical comparisons show that addressing the ecological fallacy during fine-tuning alone using QLoRA improves the performance of the larger 8B model over standard fine-tuning. Additionally, QLoRA-based continued HuLM pre-training results in a human-aware model generalizable for improved performance over eight downstream tasks with linear task classifier training alone. These results indicate the utility and importance of modeling language in the context of its original generators, the authors.
%U https://aclanthology.org/2026.conll-main.9/
%P 121-144
Markdown (Informal)
[Addressing the Ecological Fallacy in Larger LMs with Human Context](https://aclanthology.org/2026.conll-main.9/) (Soni et al., CoNLL 2026)
ACL
- Nikita Soni, Dhruv Vijay Kunjadiya, Pratham Piyush Shah, Dikshya Mohanty, H. Andrew Schwartz, and Niranjan Balasubramanian. 2026. Addressing the Ecological Fallacy in Larger LMs with Human Context. In Proceedings of the 30th Conference on Computational Natural Language Learning, pages 121–144, San Diego, California, USA. Association for Computational Linguistics.