@inproceedings{naik-etal-2025-programming,
title = "Programming by Example meets Historical Linguistics: A Large Language Model Based Approach to Sound Law Induction",
author = "Naik, Atharva and
Agrawal, Darsh and
Sng, Hong and
Marr, Clayton and
Zhang, Kexun and
Robinson, Nathaniel Romney and
Chang, Kalvin and
Byrnes, Rebecca and
Mysore, Aravind and
Rose, Carolyn and
Mortensen, David R.",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1432/",
doi = "10.18653/v1/2025.acl-long.1432",
pages = "29628--29647",
ISBN = "979-8-89176-251-0",
abstract = "Historical linguists have long written ``programs'' that convert reconstructed words in an ancestor language into their attested descendants via ordered string rewrite functions (called sound laws) However, writing these programs is time-consuming, motivating the development of automated Sound Law Induction (SLI) which we formulate as Programming by Examples (PBE) with Large Language Models (LLMs) in this paper. While LLMs have been effective for code generation, recent work has shown that PBE is challenging but improvable by fine-tuning, especially with training data drawn from the same distribution as evaluation data. In this paper, we create a conceptual framework of what constitutes a ``similar distribution'' for SLI and propose four kinds of synthetic data generation methods with varying amounts of inductive bias to investigate what leads to the best performance. Based on the results, we create a SOTA open-source model for SLI as PBE (+6{\%} pass rate with a third of the parameters of the second-best LLM) and also highlight exciting future directions for PBE research."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="naik-etal-2025-programming">
<titleInfo>
<title>Programming by Example meets Historical Linguistics: A Large Language Model Based Approach to Sound Law Induction</title>
</titleInfo>
<name type="personal">
<namePart type="given">Atharva</namePart>
<namePart type="family">Naik</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Darsh</namePart>
<namePart type="family">Agrawal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hong</namePart>
<namePart type="family">Sng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clayton</namePart>
<namePart type="family">Marr</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kexun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nathaniel</namePart>
<namePart type="given">Romney</namePart>
<namePart type="family">Robinson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalvin</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rebecca</namePart>
<namePart type="family">Byrnes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aravind</namePart>
<namePart type="family">Mysore</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Mortensen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Historical linguists have long written “programs” that convert reconstructed words in an ancestor language into their attested descendants via ordered string rewrite functions (called sound laws) However, writing these programs is time-consuming, motivating the development of automated Sound Law Induction (SLI) which we formulate as Programming by Examples (PBE) with Large Language Models (LLMs) in this paper. While LLMs have been effective for code generation, recent work has shown that PBE is challenging but improvable by fine-tuning, especially with training data drawn from the same distribution as evaluation data. In this paper, we create a conceptual framework of what constitutes a “similar distribution” for SLI and propose four kinds of synthetic data generation methods with varying amounts of inductive bias to investigate what leads to the best performance. Based on the results, we create a SOTA open-source model for SLI as PBE (+6% pass rate with a third of the parameters of the second-best LLM) and also highlight exciting future directions for PBE research.</abstract>
<identifier type="citekey">naik-etal-2025-programming</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.1432</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.1432/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>29628</start>
<end>29647</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Programming by Example meets Historical Linguistics: A Large Language Model Based Approach to Sound Law Induction
%A Naik, Atharva
%A Agrawal, Darsh
%A Sng, Hong
%A Marr, Clayton
%A Zhang, Kexun
%A Robinson, Nathaniel Romney
%A Chang, Kalvin
%A Byrnes, Rebecca
%A Mysore, Aravind
%A Rose, Carolyn
%A Mortensen, David R.
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F naik-etal-2025-programming
%X Historical linguists have long written “programs” that convert reconstructed words in an ancestor language into their attested descendants via ordered string rewrite functions (called sound laws) However, writing these programs is time-consuming, motivating the development of automated Sound Law Induction (SLI) which we formulate as Programming by Examples (PBE) with Large Language Models (LLMs) in this paper. While LLMs have been effective for code generation, recent work has shown that PBE is challenging but improvable by fine-tuning, especially with training data drawn from the same distribution as evaluation data. In this paper, we create a conceptual framework of what constitutes a “similar distribution” for SLI and propose four kinds of synthetic data generation methods with varying amounts of inductive bias to investigate what leads to the best performance. Based on the results, we create a SOTA open-source model for SLI as PBE (+6% pass rate with a third of the parameters of the second-best LLM) and also highlight exciting future directions for PBE research.
%R 10.18653/v1/2025.acl-long.1432
%U https://aclanthology.org/2025.acl-long.1432/
%U https://doi.org/10.18653/v1/2025.acl-long.1432
%P 29628-29647
Markdown (Informal)
[Programming by Example meets Historical Linguistics: A Large Language Model Based Approach to Sound Law Induction](https://aclanthology.org/2025.acl-long.1432/) (Naik et al., ACL 2025)
ACL
- Atharva Naik, Darsh Agrawal, Hong Sng, Clayton Marr, Kexun Zhang, Nathaniel Romney Robinson, Kalvin Chang, Rebecca Byrnes, Aravind Mysore, Carolyn Rose, and David R. Mortensen. 2025. Programming by Example meets Historical Linguistics: A Large Language Model Based Approach to Sound Law Induction. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 29628–29647, Vienna, Austria. Association for Computational Linguistics.