@inproceedings{maheshwary-etal-2024-pretraining,
    title = "Pretraining and Finetuning Language Models on Geospatial Networks for Accurate Address Matching",
    author = "Maheshwary, Saket and
      Paul, Arpan and
      Sohoney, Saurabh",
    editor = "Dernoncourt, Franck and
      Preo{\c{t}}iuc-Pietro, Daniel and
      Shimorina, Anastasia",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track",
    month = nov,
    year = "2024",
    address = "Miami, Florida, US",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-industry.58",
    pages = "763--773",
    abstract = "We propose a novel framework for pretraining and fine-tuning language models with the goal of determining whether two addresses represent the same physical building. Address matching and building authoritative address catalogues are important to many applications and businesses, such as delivery services, online retail, emergency services, logistics, etc. We propose to view a collection of addresses as an address graph and curate inputs for language models by placing geospatially linked addresses in the same context. Our approach jointly integrates concepts from graph theory and weak supervision with address text and geospatial semantics. This integration enables us to generate informative and diverse address pairs, facilitating pretraining and fine-tuning in a self-supervised manner. Experiments and ablation studies on manually curated datasets and comparisons with state-of-the-art techniques demonstrate the efficacy of our approach. We achieve a 24.49{\%} improvement in recall while maintaining 95{\%} precision on average, in comparison to the current baseline across multiple geographies. Further, we deploy our proposed approach and show the positive impact of improving address matching on geocode learning.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="maheshwary-etal-2024-pretraining">
    <titleInfo>
      <title>Pretraining and Finetuning Language Models on Geospatial Networks for Accurate Address Matching</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Saket</namePart>
      <namePart type="family">Maheshwary</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Arpan</namePart>
      <namePart type="family">Paul</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Saurabh</namePart>
      <namePart type="family">Sohoney</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Franck</namePart>
        <namePart type="family">Dernoncourt</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Preoţiuc-Pietro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Anastasia</namePart>
        <namePart type="family">Shimorina</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, US</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We propose a novel framework for pretraining and fine-tuning language models with the goal of determining whether two addresses represent the same physical building. Address matching and building authoritative address catalogues are important to many applications and businesses, such as delivery services, online retail, emergency services, logistics, etc. We propose to view a collection of addresses as an address graph and curate inputs for language models by placing geospatially linked addresses in the same context. Our approach jointly integrates concepts from graph theory and weak supervision with address text and geospatial semantics. This integration enables us to generate informative and diverse address pairs, facilitating pretraining and fine-tuning in a self-supervised manner. Experiments and ablation studies on manually curated datasets and comparisons with state-of-the-art techniques demonstrate the efficacy of our approach. We achieve a 24.49% improvement in recall while maintaining 95% precision on average, in comparison to the current baseline across multiple geographies. Further, we deploy our proposed approach and show the positive impact of improving address matching on geocode learning.</abstract>
    <identifier type="citekey">maheshwary-etal-2024-pretraining</identifier>
    <location>
      <url>https://aclanthology.org/2024.emnlp-industry.58</url>
    </location>
    <part>
      <date>2024-11</date>
      <extent unit="page">
        <start>763</start>
        <end>773</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Pretraining and Finetuning Language Models on Geospatial Networks for Accurate Address Matching
%A Maheshwary, Saket
%A Paul, Arpan
%A Sohoney, Saurabh
%Y Dernoncourt, Franck
%Y Preoţiuc-Pietro, Daniel
%Y Shimorina, Anastasia
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, US
%F maheshwary-etal-2024-pretraining
%X We propose a novel framework for pretraining and fine-tuning language models with the goal of determining whether two addresses represent the same physical building. Address matching and building authoritative address catalogues are important to many applications and businesses, such as delivery services, online retail, emergency services, logistics, etc. We propose to view a collection of addresses as an address graph and curate inputs for language models by placing geospatially linked addresses in the same context. Our approach jointly integrates concepts from graph theory and weak supervision with address text and geospatial semantics. This integration enables us to generate informative and diverse address pairs, facilitating pretraining and fine-tuning in a self-supervised manner. Experiments and ablation studies on manually curated datasets and comparisons with state-of-the-art techniques demonstrate the efficacy of our approach. We achieve a 24.49% improvement in recall while maintaining 95% precision on average, in comparison to the current baseline across multiple geographies. Further, we deploy our proposed approach and show the positive impact of improving address matching on geocode learning.
%U https://aclanthology.org/2024.emnlp-industry.58
%P 763-773
Markdown (Informal)
[Pretraining and Finetuning Language Models on Geospatial Networks for Accurate Address Matching](https://aclanthology.org/2024.emnlp-industry.58) (Maheshwary et al., EMNLP 2024)
ACL
Saket Maheshwary, Arpan Paul, and Saurabh Sohoney. 2024. [Pretraining and Finetuning Language Models on Geospatial Networks for Accurate Address Matching](https://aclanthology.org/2024.emnlp-industry.58). In *Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track*, pages 763–773, Miami, Florida, US. Association for Computational Linguistics.