% Conference paper record (booktitle: Findings of the ACL: EMNLP 2025).
% "Gormati" and "EMNLP" are brace-protected so sentence-casing styles keep their capitals.
@inproceedings{booshanam-etal-2025-spoken,
  title     = {Spoken Document Retrieval for an Unwritten Language: A Case Study on {Gormati}},
  author    = {Booshanam, Sanjay and
               Chen, Kelly and
               Klejch, Ondrej and
               Reitmaier, Thomas and
               Raju, Dani Kalarikalayil and
               Wallington, Electra and
               Markl, Nina and
               Pearson, Jennifer and
               Jones, Matt and
               Robinson, Simon and
               Bell, Peter},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1224/},
  doi       = {10.18653/v1/2025.findings-emnlp.1224},
  pages     = {22497--22509},
  isbn      = {979-8-89176-335-7},
  abstract  = {Speakers of unwritten languages have the potential to benefit from speech-based automatic information retrieval systems. This paper proposes a speech embedding technique that facilitates such a system that we can be used in a zero-shot manner on the target language. After conducting development experiments on several written Indic languages, we evaluate our method on a corpus of Gormati {--} an unwritten language {--} that was previously collected in partnership with an agrarian Banjara community in Maharashtra State, India, specifically for the purposes of information retrieval. Our system achieves a Top 5 retrieval rate of 87.9{\%} on this data, giving the hope that it may be useable by unwritten language speakers worldwide.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="booshanam-etal-2025-spoken">
    <titleInfo>
      <title>Spoken Document Retrieval for an Unwritten Language: A Case Study on Gormati</title>
    </titleInfo>

    <!-- Authors of the paper (one <name> element per person, role "author"). -->
    <name type="personal">
      <namePart type="given">Sanjay</namePart>
      <namePart type="family">Booshanam</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kelly</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ondrej</namePart>
      <namePart type="family">Klejch</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Thomas</namePart>
      <namePart type="family">Reitmaier</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <!-- Two given-name parts are recorded for this author. -->
      <namePart type="given">Dani</namePart>
      <namePart type="given">Kalarikalayil</namePart>
      <namePart type="family">Raju</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Electra</namePart>
      <namePart type="family">Wallington</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nina</namePart>
      <namePart type="family">Markl</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jennifer</namePart>
      <namePart type="family">Pearson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Matt</namePart>
      <namePart type="family">Jones</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simon</namePart>
      <namePart type="family">Robinson</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peter</namePart>
      <namePart type="family">Bell</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2025</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-335-7</identifier>
    </relatedItem>

    <abstract>Speakers of unwritten languages have the potential to benefit from speech-based automatic information retrieval systems. This paper proposes a speech embedding technique that facilitates such a system that we can be used in a zero-shot manner on the target language. After conducting development experiments on several written Indic languages, we evaluate our method on a corpus of Gormati – an unwritten language – that was previously collected in partnership with an agrarian Banjara community in Maharashtra State, India, specifically for the purposes of information retrieval. Our system achieves a Top 5 retrieval rate of 87.9% on this data, giving the hope that it may be useable by unwritten language speakers worldwide.</abstract>

    <!-- Identifiers and landing-page URL for the paper itself. -->
    <identifier type="citekey">booshanam-etal-2025-spoken</identifier>
    <identifier type="doi">10.18653/v1/2025.findings-emnlp.1224</identifier>
    <location>
      <url>https://aclanthology.org/2025.findings-emnlp.1224/</url>
    </location>

    <!-- Page range of the paper within the host volume. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>22497</start>
        <end>22509</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Spoken Document Retrieval for an Unwritten Language: A Case Study on Gormati
%A Booshanam, Sanjay
%A Chen, Kelly
%A Klejch, Ondrej
%A Reitmaier, Thomas
%A Raju, Dani Kalarikalayil
%A Wallington, Electra
%A Markl, Nina
%A Pearson, Jennifer
%A Jones, Matt
%A Robinson, Simon
%A Bell, Peter
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Findings of the Association for Computational Linguistics: EMNLP 2025
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-335-7
%F booshanam-etal-2025-spoken
%X Speakers of unwritten languages have the potential to benefit from speech-based automatic information retrieval systems. This paper proposes a speech embedding technique that facilitates such a system that we can be used in a zero-shot manner on the target language. After conducting development experiments on several written Indic languages, we evaluate our method on a corpus of Gormati – an unwritten language – that was previously collected in partnership with an agrarian Banjara community in Maharashtra State, India, specifically for the purposes of information retrieval. Our system achieves a Top 5 retrieval rate of 87.9% on this data, giving the hope that it may be useable by unwritten language speakers worldwide.
%R 10.18653/v1/2025.findings-emnlp.1224
%U https://aclanthology.org/2025.findings-emnlp.1224/
%U https://doi.org/10.18653/v1/2025.findings-emnlp.1224
%P 22497-22509
Markdown (Informal)
[Spoken Document Retrieval for an Unwritten Language: A Case Study on Gormati](https://aclanthology.org/2025.findings-emnlp.1224/) (Booshanam et al., Findings 2025)
ACL
- Sanjay Booshanam, Kelly Chen, Ondrej Klejch, Thomas Reitmaier, Dani Kalarikalayil Raju, Electra Wallington, Nina Markl, Jennifer Pearson, Matt Jones, Simon Robinson, and Peter Bell. 2025. Spoken Document Retrieval for an Unwritten Language: A Case Study on Gormati. In Findings of the Association for Computational Linguistics: EMNLP 2025, pages 22497–22509, Suzhou, China. Association for Computational Linguistics.