@inproceedings{vanterpool-aharodnik-2025-handcrafted,
    title = "From Handcrafted Features to {LLM}s: A Comparative Study in Native Language Identification",
    author = "Vanterpool, Aliyah C.  and
      Aharodnik, Katsiaryna",
    editor = "Picazo-Izquierdo, Alicia  and
      Estevanell-Valladares, Ernesto Luis  and
      Mitkov, Ruslan  and
      Guillena, Rafael Mu{\~n}oz  and
      Cerd{\'a}, Ra{\'u}l Garc{\'i}a",
    booktitle = "Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models",
    month = sep,
    year = "2025",
    address = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
    url = "https://aclanthology.org/2025.r2lm-1.15/",
    pages = "144--153",
abstract = "This study compares a traditional machine learning feature-engineering approach to a large language models (LLMs) fine-tuning method for Native Language Identification (NLI). We explored the COREFL corpus, which consists of L2 English narratives produced by Spanish and German L1 speakers with lower-advanced English proficiency (C1) (Lozano et al., 2020). For the feature-engineering approach, we extracted language productivity, linguistic diversity, and n-gram features for Support Vector Machine (SVM) classification. We also looked at sentence embeddings with SVM and logistic regression. For the LLM approach, we evaluated BERT-like models and GPT-4. The feature-engineering approach, particularly n-grams, outperformed the LLMs. Sentence-BERT embeddings with SVM achieved the second-highest accuracy (93{\%}), while GPT-4 reached an average accuracy of 90.4{\%} across three runs when prompted with labels. These findings suggest that feature engineering remains a robust method for NLI, especially for smaller datasets with subtle linguistic differences between classes. This study contributes to the comparative analysis of traditional machine learning and transformer-based LLMs, highlighting current LLM limitations in handling domain-specific data and their need for larger training resources."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="vanterpool-aharodnik-2025-handcrafted">
    <titleInfo>
        <title>From Handcrafted Features to LLMs: A Comparative Study in Native Language Identification</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Aliyah</namePart>
        <namePart type="given">C</namePart>
        <namePart type="family">Vanterpool</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Katsiaryna</namePart>
        <namePart type="family">Aharodnik</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Alicia</namePart>
            <namePart type="family">Picazo-Izquierdo</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Ernesto</namePart>
            <namePart type="given">Luis</namePart>
            <namePart type="family">Estevanell-Valladares</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Ruslan</namePart>
            <namePart type="family">Mitkov</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Rafael</namePart>
            <namePart type="given">Muñoz</namePart>
            <namePart type="family">Guillena</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Raúl</namePart>
            <namePart type="given">García</namePart>
            <namePart type="family">Cerdá</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
            <place>
                <placeTerm type="text">Varna, Bulgaria</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This study compares a traditional machine learning feature-engineering approach to a large language model (LLM) fine-tuning method for Native Language Identification (NLI). We explored the COREFL corpus, which consists of L2 English narratives produced by Spanish and German L1 speakers with lower-advanced English proficiency (C1) (Lozano et al., 2020). For the feature-engineering approach, we extracted language productivity, linguistic diversity, and n-gram features for Support Vector Machine (SVM) classification. We also examined sentence embeddings with SVM and logistic regression. For the LLM approach, we evaluated BERT-like models and GPT-4. The feature-engineering approach, particularly n-grams, outperformed the LLMs. Sentence-BERT embeddings with SVM achieved the second-highest accuracy (93%), while GPT-4 reached an average accuracy of 90.4% across three runs when prompted with labels. These findings suggest that feature engineering remains a robust method for NLI, especially for smaller datasets with subtle linguistic differences between classes. This study contributes to the comparative analysis of traditional machine learning and transformer-based LLMs, highlighting current LLM limitations in handling domain-specific data and their need for larger training resources.</abstract>
<identifier type="citekey">vanterpool-aharodnik-2025-handcrafted</identifier>
<location>
<url>https://aclanthology.org/2025.r2lm-1.15/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>144</start>
<end>153</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Handcrafted Features to LLMs: A Comparative Study in Native Language Identification
%A Vanterpool, Aliyah C.
%A Aharodnik, Katsiaryna
%Y Picazo-Izquierdo, Alicia
%Y Estevanell-Valladares, Ernesto Luis
%Y Mitkov, Ruslan
%Y Guillena, Rafael Muñoz
%Y Cerdá, Raúl García
%S Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F vanterpool-aharodnik-2025-handcrafted
%X This study compares a traditional machine learning feature-engineering approach to a large language model (LLM) fine-tuning method for Native Language Identification (NLI). We explored the COREFL corpus, which consists of L2 English narratives produced by Spanish and German L1 speakers with lower-advanced English proficiency (C1) (Lozano et al., 2020). For the feature-engineering approach, we extracted language productivity, linguistic diversity, and n-gram features for Support Vector Machine (SVM) classification. We also examined sentence embeddings with SVM and logistic regression. For the LLM approach, we evaluated BERT-like models and GPT-4. The feature-engineering approach, particularly n-grams, outperformed the LLMs. Sentence-BERT embeddings with SVM achieved the second-highest accuracy (93%), while GPT-4 reached an average accuracy of 90.4% across three runs when prompted with labels. These findings suggest that feature engineering remains a robust method for NLI, especially for smaller datasets with subtle linguistic differences between classes. This study contributes to the comparative analysis of traditional machine learning and transformer-based LLMs, highlighting current LLM limitations in handling domain-specific data and their need for larger training resources.
%U https://aclanthology.org/2025.r2lm-1.15/
%P 144-153
Markdown (Informal)
[From Handcrafted Features to LLMs: A Comparative Study in Native Language Identification](https://aclanthology.org/2025.r2lm-1.15/) (Vanterpool & Aharodnik, R2LM 2025)
ACL
Aliyah C. Vanterpool and Katsiaryna Aharodnik. 2025. From Handcrafted Features to LLMs: A Comparative Study in Native Language Identification. In Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models, pages 144–153, Varna, Bulgaria. INCOMA Ltd., Shoumen, Bulgaria.