@article{chim-etal-2025-evaluating,
title = "Evaluating Synthetic Data Generation from User Generated Text",
author = "Chim, Jenny and
Ive, Julia and
Liakata, Maria",
journal = "Computational Linguistics",
volume = "51",
number = "1",
month = mar,
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.cl-1.6/",
doi = "10.1162/coli_a_00540",
pages = "191--233",
abstract = "User-generated content provides a rich resource to study social and behavioral phenomena. Although its application potential is currently limited by the paucity of expert labels and the privacy risks inherent in personal data, synthetic data can help mitigate this bottleneck. In this work, we introduce an evaluation framework to facilitate research on synthetic language data generation for user-generated text. We define a set of aspects for assessing data quality, namely, style preservation, meaning preservation, and divergence, as a proxy for privacy. We introduce metrics corresponding to each aspect. Moreover, through a set of generation strategies and representative tasks and baselines across domains, we demonstrate the relation between the quality aspects of synthetic user generated content, generation strategies, metrics, and downstream performance. To our knowledge, our work is the first unified evaluation framework for user-generated text in relation to the specified aspects, offering both intrinsic and extrinsic evaluation. We envisage it will facilitate developments towards shareable, high-quality synthetic language data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chim-etal-2025-evaluating">
<titleInfo>
<title>Evaluating Synthetic Data Generation from User Generated Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jenny</namePart>
<namePart type="family">Chim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Ive</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>User-generated content provides a rich resource to study social and behavioral phenomena. Although its application potential is currently limited by the paucity of expert labels and the privacy risks inherent in personal data, synthetic data can help mitigate this bottleneck. In this work, we introduce an evaluation framework to facilitate research on synthetic language data generation for user-generated text. We define a set of aspects for assessing data quality, namely, style preservation, meaning preservation, and divergence, as a proxy for privacy. We introduce metrics corresponding to each aspect. Moreover, through a set of generation strategies and representative tasks and baselines across domains, we demonstrate the relation between the quality aspects of synthetic user generated content, generation strategies, metrics, and downstream performance. To our knowledge, our work is the first unified evaluation framework for user-generated text in relation to the specified aspects, offering both intrinsic and extrinsic evaluation. We envisage it will facilitate developments towards shareable, high-quality synthetic language data.</abstract>
<identifier type="citekey">chim-etal-2025-evaluating</identifier>
<identifier type="doi">10.1162/coli_a_00540</identifier>
<location>
<url>https://aclanthology.org/2025.cl-1.6/</url>
</location>
<part>
<date>2025-03</date>
<detail type="volume"><number>51</number></detail>
<detail type="issue"><number>1</number></detail>
<extent unit="page">
<start>191</start>
<end>233</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Evaluating Synthetic Data Generation from User Generated Text
%A Chim, Jenny
%A Ive, Julia
%A Liakata, Maria
%J Computational Linguistics
%D 2025
%8 March
%V 51
%N 1
%I MIT Press
%C Cambridge, MA
%F chim-etal-2025-evaluating
%X User-generated content provides a rich resource to study social and behavioral phenomena. Although its application potential is currently limited by the paucity of expert labels and the privacy risks inherent in personal data, synthetic data can help mitigate this bottleneck. In this work, we introduce an evaluation framework to facilitate research on synthetic language data generation for user-generated text. We define a set of aspects for assessing data quality, namely, style preservation, meaning preservation, and divergence, as a proxy for privacy. We introduce metrics corresponding to each aspect. Moreover, through a set of generation strategies and representative tasks and baselines across domains, we demonstrate the relation between the quality aspects of synthetic user generated content, generation strategies, metrics, and downstream performance. To our knowledge, our work is the first unified evaluation framework for user-generated text in relation to the specified aspects, offering both intrinsic and extrinsic evaluation. We envisage it will facilitate developments towards shareable, high-quality synthetic language data.
%R 10.1162/coli_a_00540
%U https://aclanthology.org/2025.cl-1.6/
%U https://doi.org/10.1162/coli_a_00540
%P 191-233
Markdown (Informal)
[Evaluating Synthetic Data Generation from User Generated Text](https://aclanthology.org/2025.cl-1.6/) (Chim et al., CL 2025)
ACL