@inproceedings{singhal-etal-2024-geoindia,
title = "{G}eo{I}ndia: A {S}eq2{S}eq Geocoding Approach for {I}ndian Addresses",
author = "Singhal, Bhavuk and
Aditya, Anshu and
Todwal, Lokesh and
Jain, Shubham and
Mukherjee, Debashis",
editor = "Dernoncourt, Franck and
Preo{\c{t}}iuc-Pietro, Daniel and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-industry.29",
pages = "395--407",
abstract = "Geocoding, the conversion of unstructured geographic text into structured spatial data, is essential for logistics, urban planning, and location-based services. Indian addresses with their diverse languages, scripts, and formats present significant challenges that existing geocoding methods often fail to address, particularly at fine-grained resolutions. In this paper, we propose GeoIndia, a novel geocoding system designed specifically for Indian addresses using hierarchical H3-cell prediction within a Seq2Seq framework. Our methodology includes a comprehensive analysis of Indian addressing systems, leading to the development of a data correction strategy that enhances prediction accuracy. We investigate two model architectures, Flan-T5-base (T5) and Llama-3-8b (QLF-Llama-3), due to their strong sequence generation capabilities. We trained around 29 models with one dedicated to each state, and results show that our approach provides superior accuracy and reliability across multiple Indian states, outperforming the well-renowned geocoding platform Google Maps. In multiple states, we achieved more than an 50{\%} reduction in mean distance error and more than a 85{\%} reduction in 99th percentile distance error compared to Google Maps. This advancement can help in optimizing logistics in the e-commerce sector, reducing delivery failures and improving customer satisfaction.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="singhal-etal-2024-geoindia">
<titleInfo>
<title>GeoIndia: A Seq2Seq Geocoding Approach for Indian Addresses</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bhavuk</namePart>
<namePart type="family">Singhal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anshu</namePart>
<namePart type="family">Aditya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lokesh</namePart>
<namePart type="family">Todwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shubham</namePart>
<namePart type="family">Jain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Debashis</namePart>
<namePart type="family">Mukherjee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
</titleInfo>
<name type="personal">
<namePart type="given">Franck</namePart>
<namePart type="family">Dernoncourt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, US</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Geocoding, the conversion of unstructured geographic text into structured spatial data, is essential for logistics, urban planning, and location-based services. Indian addresses with their diverse languages, scripts, and formats present significant challenges that existing geocoding methods often fail to address, particularly at fine-grained resolutions. In this paper, we propose GeoIndia, a novel geocoding system designed specifically for Indian addresses using hierarchical H3-cell prediction within a Seq2Seq framework. Our methodology includes a comprehensive analysis of Indian addressing systems, leading to the development of a data correction strategy that enhances prediction accuracy. We investigate two model architectures, Flan-T5-base (T5) and Llama-3-8b (QLF-Llama-3), due to their strong sequence generation capabilities. We trained around 29 models with one dedicated to each state, and results show that our approach provides superior accuracy and reliability across multiple Indian states, outperforming the well-renowned geocoding platform Google Maps. In multiple states, we achieved more than an 50% reduction in mean distance error and more than a 85% reduction in 99th percentile distance error compared to Google Maps. This advancement can help in optimizing logistics in the e-commerce sector, reducing delivery failures and improving customer satisfaction.</abstract>
<identifier type="citekey">singhal-etal-2024-geoindia</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-industry.29</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>395</start>
<end>407</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GeoIndia: A Seq2Seq Geocoding Approach for Indian Addresses
%A Singhal, Bhavuk
%A Aditya, Anshu
%A Todwal, Lokesh
%A Jain, Shubham
%A Mukherjee, Debashis
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F singhal-etal-2024-geoindia
%X Geocoding, the conversion of unstructured geographic text into structured spatial data, is essential for logistics, urban planning, and location-based services. Indian addresses with their diverse languages, scripts, and formats present significant challenges that existing geocoding methods often fail to address, particularly at fine-grained resolutions. In this paper, we propose GeoIndia, a novel geocoding system designed specifically for Indian addresses using hierarchical H3-cell prediction within a Seq2Seq framework. Our methodology includes a comprehensive analysis of Indian addressing systems, leading to the development of a data correction strategy that enhances prediction accuracy. We investigate two model architectures, Flan-T5-base (T5) and Llama-3-8b (QLF-Llama-3), due to their strong sequence generation capabilities. We trained around 29 models with one dedicated to each state, and results show that our approach provides superior accuracy and reliability across multiple Indian states, outperforming the well-renowned geocoding platform Google Maps. In multiple states, we achieved more than an 50% reduction in mean distance error and more than a 85% reduction in 99th percentile distance error compared to Google Maps. This advancement can help in optimizing logistics in the e-commerce sector, reducing delivery failures and improving customer satisfaction.
%U https://aclanthology.org/2024.emnlp-industry.29
%P 395-407
Markdown (Informal)
[GeoIndia: A Seq2Seq Geocoding Approach for Indian Addresses](https://aclanthology.org/2024.emnlp-industry.29) (Singhal et al., EMNLP 2024)
ACL
- Bhavuk Singhal, Anshu Aditya, Lokesh Todwal, Shubham Jain, and Debashis Mukherjee. 2024. GeoIndia: A Seq2Seq Geocoding Approach for Indian Addresses. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track, pages 395–407, Miami, Florida, US. Association for Computational Linguistics.