% Workshop paper in the CDL 2026 proceedings (see the url field).
% address carries only the city so styles print a short publication place;
% the full street address of the venue is preserved in the venue field,
% which classic BibTeX styles ignore.
@inproceedings{kuwanto-etal-2026-linguistics,
  title     = {Linguistics Theory Meets {LLM}: Code-Switched Text Generation via Equivalence Constrained Large Language Models},
  author    = {Kuwanto, Garry and
               Agarwal, Chaitanya and
               Winata, Genta Indra and
               Wijaya, Derry Tanti},
  editor    = {Ma, Martin Ziqiao and
               Liu, Emmy and
               Liu, Jing and
               Chang, Tyler A. and
               Fourtassi, Abdellah and
               Warstadt, Alex and
               Hahn, Michael and
               Sun, Weiwei and
               Shi, Freda},
  booktitle = {Proceedings of the 1st Workshop on Computational Developmental Linguistics ({CDL})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  venue     = {Grand Hyatt Manchester San Diego, 1 Market Pl, San Diego, CA 92101},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.cdl-1.1/},
  pages     = {1--14},
  isbn      = {979-8-89176-428-6},
  abstract  = {Code-switching is a common practice for millions of multilingual speakers but remains challenging for Large Language Models (LLMs). This paper investigates LLM capabilities in generating code-switched text, conducting extensive experiments across five diverse language pairs: English paired with Hindi, Tamil, Malayalam, and Indonesian, as well as Indonesian-Javanese. Our analysis, grounded in comprehensive human evaluations by native speakers, uncovers a directional asymmetry: LLMs consistently produce higher-quality (more accurate and fluent) code-switched text when prompted with a lower-resource language (e.g., Hindi, Tamil, Javanese) as the source, compared to when a higher-resource language (English, Indonesian) serves as the source. This asymmetry mirrors sociolinguistic patterns, particularly the Matrix Language Frame model, suggesting LLMs implicitly learn common code-switching structures from their training data where regional languages often form the grammatical base. Furthermore, we find that explicit linguistic guidance, applied through Equivalence Constraint Theory (ECT) to identify switching points, primarily benefits generation quality only in the less common, higher-resource-source direction where LLMs intrinsically struggle. These findings highlight a crucial interplay between the implicit linguistic knowledge captured by LLMs and the targeted utility of explicit linguistic constraints. We also introduce CSPref, a pairwise preference dataset derived from our human evaluations, to facilitate future research in code-switching generation and evaluation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record for the paper identified by the
     mods ID below. Record layout: paper title, four author names, issue date,
     the host proceedings (relatedItem), abstract, citekey, URL and page extent. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kuwanto-etal-2026-linguistics">
<titleInfo>
<title>Linguistics Theory Meets LLM: Code-Switched Text Generation via Equivalence Constrained Large Language Models</title>
</titleInfo>
<!-- Authors, in order; multi-part given names are split into one namePart each. -->
<name type="personal">
<namePart type="given">Garry</namePart>
<namePart type="family">Kuwanto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chaitanya</namePart>
<namePart type="family">Agarwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Genta</namePart>
<namePart type="given">Indra</namePart>
<namePart type="family">Winata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Derry</namePart>
<namePart type="given">Tanti</namePart>
<namePart type="family">Wijaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume this paper appears in, with its nine
     editors, publisher, place and ISBN. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Computational Developmental Linguistics (CDL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="given">Ziqiao</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Emmy</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tyler</namePart>
<!-- NOTE(review): the middle initial is stored without its period here, while
     the other citation formats in this file write "Tyler A." - confirm intended. -->
<namePart type="given">A</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdellah</namePart>
<namePart type="family">Fourtassi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alex</namePart>
<namePart type="family">Warstadt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Hahn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weiwei</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Freda</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<!-- NOTE(review): this is a full street address with ZIP code rather than a
     city name - confirm that downstream consumers expect this form. -->
<placeTerm type="text">Grand Hyatt Manchester San Diego, 1 Market Pl, San Diego, CA 92101</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-428-6</identifier>
</relatedItem>
<abstract>Code-switching is a common practice for millions of multilingual speakers but remains challenging for Large Language Models (LLMs). This paper investigates LLM capabilities in generating code-switched text, conducting extensive experiments across five diverse language pairs: English paired with Hindi, Tamil, Malayalam, and Indonesian, as well as Indonesian-Javanese. Our analysis, grounded in comprehensive human evaluations by native speakers, uncovers a directional asymmetry: LLMs consistently produce higher-quality (more accurate and fluent) code-switched text when prompted with a lower-resource language (e.g., Hindi, Tamil, Javanese) as the source, compared to when a higher-resource language (English, Indonesian) serves as the source. This asymmetry mirrors sociolinguistic patterns, particularly the Matrix Language Frame model, suggesting LLMs implicitly learn common code-switching structures from their training data where regional languages often form the grammatical base. Furthermore, we find that explicit linguistic guidance, applied through Equivalence Constraint Theory (ECT) to identify switching points, primarily benefits generation quality only in the less common, higher-resource-source direction where LLMs intrinsically struggle. These findings highlight a crucial interplay between the implicit linguistic knowledge captured by LLMs and the targeted utility of explicit linguistic constraints. We also introduce CSPref, a pairwise preference dataset derived from our human evaluations, to facilitate future research in code-switching generation and evaluation.</abstract>
<!-- The citekey repeats the mods ID attribute of this record. -->
<identifier type="citekey">kuwanto-etal-2026-linguistics</identifier>
<location>
<url>https://aclanthology.org/2026.cdl-1.1/</url>
</location>
<!-- Position within the host volume: pages 1 to 14, dated 2026-07. -->
<part>
<date>2026-07</date>
<extent unit="page">
<start>1</start>
<end>14</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Linguistics Theory Meets LLM: Code-Switched Text Generation via Equivalence Constrained Large Language Models
%A Kuwanto, Garry
%A Agarwal, Chaitanya
%A Winata, Genta Indra
%A Wijaya, Derry Tanti
%Y Ma, Martin Ziqiao
%Y Liu, Emmy
%Y Liu, Jing
%Y Chang, Tyler A.
%Y Fourtassi, Abdellah
%Y Warstadt, Alex
%Y Hahn, Michael
%Y Sun, Weiwei
%Y Shi, Freda
%S Proceedings of the 1st Workshop on Computational Developmental Linguistics (CDL)
%D 2026
%8 July
%I Association for Computational Linguistics
%C Grand Hyatt Manchester San Diego, 1 Market Pl, San Diego, CA 92101
%@ 979-8-89176-428-6
%F kuwanto-etal-2026-linguistics
%X Code-switching is a common practice for millions of multilingual speakers but remains challenging for Large Language Models (LLMs). This paper investigates LLM capabilities in generating code-switched text, conducting extensive experiments across five diverse language pairs: English paired with Hindi, Tamil, Malayalam, and Indonesian, as well as Indonesian-Javanese. Our analysis, grounded in comprehensive human evaluations by native speakers, uncovers a directional asymmetry: LLMs consistently produce higher-quality (more accurate and fluent) code-switched text when prompted with a lower-resource language (e.g., Hindi, Tamil, Javanese) as the source, compared to when a higher-resource language (English, Indonesian) serves as the source. This asymmetry mirrors sociolinguistic patterns, particularly the Matrix Language Frame model, suggesting LLMs implicitly learn common code-switching structures from their training data where regional languages often form the grammatical base. Furthermore, we find that explicit linguistic guidance, applied through Equivalence Constraint Theory (ECT) to identify switching points, primarily benefits generation quality only in the less common, higher-resource-source direction where LLMs intrinsically struggle. These findings highlight a crucial interplay between the implicit linguistic knowledge captured by LLMs and the targeted utility of explicit linguistic constraints. We also introduce CSPref, a pairwise preference dataset derived from our human evaluations, to facilitate future research in code-switching generation and evaluation.
%U https://aclanthology.org/2026.cdl-1.1/
%P 1-14
Markdown (Informal)
[Linguistics Theory Meets LLM: Code-Switched Text Generation via Equivalence Constrained Large Language Models](https://aclanthology.org/2026.cdl-1.1/) (Kuwanto et al., CDL 2026)
ACL