% LM4DH 2025 workshop paper (ACL Anthology ID 2025.lm4dh-1.2).
@inproceedings{aguilar-canto-etal-2025-simulating,
  title     = {Simulating Complex Immediate Textual Variation with Large Language Models},
  author    = {Aguilar-Canto, Fernando and
               Espinosa-Juarez, Alberto and
               Calvo, Hiram},
  editor    = {Arachchige, Isuri Nanomi and
               Frontini, Francesca and
               Mitkov, Ruslan and
               Rayson, Paul},
  booktitle = {Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities},
  month     = sep,
  year      = {2025},
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://aclanthology.org/2025.lm4dh-1.2/},
  pages     = {25--31},
  abstract  = {Immediate Textual Variation (ITV) is defined as the process of introducing changes during text transmission from one node to another. One-step variation can be useful for testing specific philological hypotheses. In this paper, we propose using Large Language Models (LLMs) as text-modifying agents. We analyze three scenarios: (1) simple variations (omissions), (2) paraphrasing, and (3) paraphrasing with bias injection (polarity). We generate simulated news items using a predefined scheme. We hypothesize that central tendency measures{---}such as the mean and median vectors in the feature space of sentence transformers{---}can effectively approximate the original text representation. Our findings indicate that the median vector is a more accurate estimator of the original vector than most alternatives. However, in cases involving substantial rephrasing, the agent that produces the least semantic drift provides the best estimation, aligning with the principles of B{\'e}dierian textual criticism.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aguilar-canto-etal-2025-simulating">
<titleInfo>
<title>Simulating Complex Immediate Textual Variation with Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fernando</namePart>
<namePart type="family">Aguilar-Canto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alberto</namePart>
<namePart type="family">Espinosa-Juarez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiram</namePart>
<namePart type="family">Calvo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities</title>
</titleInfo>
<name type="personal">
<namePart type="given">Isuri</namePart>
<namePart type="given">Nanomi</namePart>
<namePart type="family">Arachchige</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesca</namePart>
<namePart type="family">Frontini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Rayson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Immediate Textual Variation (ITV) is defined as the process of introducing changes during text transmission from one node to another. One-step variation can be useful for testing specific philological hypotheses. In this paper, we propose using Large Language Models (LLMs) as text-modifying agents. We analyze three scenarios: (1) simple variations (omissions), (2) paraphrasing, and (3) paraphrasing with bias injection (polarity). We generate simulated news items using a predefined scheme. We hypothesize that central tendency measures—such as the mean and median vectors in the feature space of sentence transformers—can effectively approximate the original text representation. Our findings indicate that the median vector is a more accurate estimator of the original vector than most alternatives. However, in cases involving substantial rephrasing, the agent that produces the least semantic drift provides the best estimation, aligning with the principles of Bédierian textual criticism.</abstract>
<identifier type="citekey">aguilar-canto-etal-2025-simulating</identifier>
<location>
<url>https://aclanthology.org/2025.lm4dh-1.2/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>25</start>
<end>31</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Simulating Complex Immediate Textual Variation with Large Language Models
%A Aguilar-Canto, Fernando
%A Espinosa-Juarez, Alberto
%A Calvo, Hiram
%Y Arachchige, Isuri Nanomi
%Y Frontini, Francesca
%Y Mitkov, Ruslan
%Y Rayson, Paul
%S Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F aguilar-canto-etal-2025-simulating
%X Immediate Textual Variation (ITV) is defined as the process of introducing changes during text transmission from one node to another. One-step variation can be useful for testing specific philological hypotheses. In this paper, we propose using Large Language Models (LLMs) as text-modifying agents. We analyze three scenarios: (1) simple variations (omissions), (2) paraphrasing, and (3) paraphrasing with bias injection (polarity). We generate simulated news items using a predefined scheme. We hypothesize that central tendency measures—such as the mean and median vectors in the feature space of sentence transformers—can effectively approximate the original text representation. Our findings indicate that the median vector is a more accurate estimator of the original vector than most alternatives. However, in cases involving substantial rephrasing, the agent that produces the least semantic drift provides the best estimation, aligning with the principles of Bédierian textual criticism.
%U https://aclanthology.org/2025.lm4dh-1.2/
%P 25-31
Markdown (Informal)
[Simulating Complex Immediate Textual Variation with Large Language Models](https://aclanthology.org/2025.lm4dh-1.2/) (Aguilar-Canto et al., LM4DH 2025)
ACL