% BibTeX record as exported from the ACL Anthology (see the url field).
% The same citation in MODS XML, EndNote and plain-text form follows the entry.
@inproceedings{milicka-machalek-2026-ai-corpus,
    title     = {{AI} Corpus Linguist: More than a Year of Experience},
    author    = {Mili{\v{c}}ka, Ji{\v{r}}{\'i} and
                 Mach{\'a}lek, Tom{\'a}{\v{s}}},
    editor    = {Alves, Diego and
                 Bizzoni, Yuri and
                 Degaetano-Ortlieb, Stefania and
                 Kazantseva, Anna and
                 Pagel, Janis and
                 Szpakowicz, Stan},
    booktitle = {Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.latechclfl-1.29/},
    pages     = {305--310},
    isbn      = {979-8-89176-373-9},
    abstract  = {We present an AI assistant designed to help researchers interact with language corpora using natural language instead of formal query languages. Built as a custom GPT with access to multilingual corpora via Czech National Corpus platform API, the system translates research questions into CQL queries, retrieves corpus data, and guides users through linguistic analysis. After more than a year of deployment, the system has processed over 1000 interactions with human users. We discuss the hybrid approach combining rule-based translation with LLM intelligence, challenges of building on a constantly evolving platform, and lessons learned from production usage. Notably, this system represents the first voice-enabled corpus interface in history, significantly lowering barriers to corpus-based research for non-technical users and users outside linguistic fields.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for the paper; the proceedings volume it appears in is the host relatedItem. -->
  <mods ID="milicka-machalek-2026-ai-corpus">
    <titleInfo>
      <title>AI Corpus Linguist: More than a Year of Experience</title>
    </titleInfo>
    <!-- Paper authors -->
    <name type="personal">
      <namePart type="given">Jiří</namePart>
      <namePart type="family">Milička</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tomáš</namePart>
      <namePart type="family">Machálek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host volume: proceedings title, editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Diego</namePart>
        <namePart type="family">Alves</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yuri</namePart>
        <namePart type="family">Bizzoni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stefania</namePart>
        <namePart type="family">Degaetano-Ortlieb</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anna</namePart>
        <namePart type="family">Kazantseva</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Janis</namePart>
        <namePart type="family">Pagel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stan</namePart>
        <namePart type="family">Szpakowicz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-373-9</identifier>
    </relatedItem>
    <abstract>We present an AI assistant designed to help researchers interact with language corpora using natural language instead of formal query languages. Built as a custom GPT with access to multilingual corpora via Czech National Corpus platform API, the system translates research questions into CQL queries, retrieves corpus data, and guides users through linguistic analysis. After more than a year of deployment, the system has processed over 1000 interactions with human users. We discuss the hybrid approach combining rule-based translation with LLM intelligence, challenges of building on a constantly evolving platform, and lessons learned from production usage. Notably, this system represents the first voice-enabled corpus interface in history, significantly lowering barriers to corpus-based research for non-technical users and users outside linguistic fields.</abstract>
    <identifier type="citekey">milicka-machalek-2026-ai-corpus</identifier>
    <location>
      <url>https://aclanthology.org/2026.latechclfl-1.29/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>305</start>
        <end>310</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T AI Corpus Linguist: More than a Year of Experience
%A Milička, Jiří
%A Machálek, Tomáš
%Y Alves, Diego
%Y Bizzoni, Yuri
%Y Degaetano-Ortlieb, Stefania
%Y Kazantseva, Anna
%Y Pagel, Janis
%Y Szpakowicz, Stan
%S Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-373-9
%F milicka-machalek-2026-ai-corpus
%X We present an AI assistant designed to help researchers interact with language corpora using natural language instead of formal query languages. Built as a custom GPT with access to multilingual corpora via Czech National Corpus platform API, the system translates research questions into CQL queries, retrieves corpus data, and guides users through linguistic analysis. After more than a year of deployment, the system has processed over 1000 interactions with human users. We discuss the hybrid approach combining rule-based translation with LLM intelligence, challenges of building on a constantly evolving platform, and lessons learned from production usage. Notably, this system represents the first voice-enabled corpus interface in history, significantly lowering barriers to corpus-based research for non-technical users and users outside linguistic fields.
%U https://aclanthology.org/2026.latechclfl-1.29/
%P 305-310
Markdown (Informal)
[AI Corpus Linguist: More than a Year of Experience](https://aclanthology.org/2026.latechclfl-1.29/) (Milička & Machálek, LaTeCH-CLfL 2026)
ACL
- Jiří Milička and Tomáš Machálek. 2026. AI Corpus Linguist: More than a Year of Experience. In Proceedings of the 10th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026, pages 305–310, Rabat, Morocco. Association for Computational Linguistics.