@comment{Abstract punctuation repaired: two sentence joins in the exported text lacked a space after the period.}
@inproceedings{arabzadeh-clarke-2024-frechet,
  title     = {Fr{\'e}chet Distance for Offline Evaluation of Information Retrieval Systems with Sparse Labels},
  author    = {Arabzadeh, Negar and
               Clarke, Charles},
  editor    = {Graham, Yvette and
               Purver, Matthew},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2024},
  address   = {St. Julian{'}s, Malta},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.eacl-long.26},
  pages     = {420--431},
  abstract  = {The rapid advancement of natural language processing, information retrieval (IR), computer vision, and other technologies has presented significant challenges in evaluating the performance of these systems. One of the main challenges is the scarcity of human-labeled data, which hinders the fair and accurate assessment of these systems. In this work, we specifically focus on evaluating IR systems with sparse labels, borrowing from recent research on evaluating computer vision tasks. Taking inspiration from the success of using Fr{\'e}chet Inception Distance (FID) in assessing text-to-image generation systems, we propose leveraging the Fr{\'e}chet Distance to measure the distance between the distributions of relevant judged items and retrieved results. Our experimental results on MS MARCO V1 dataset and TREC Deep Learning Tracks query sets demonstrate the effectiveness of the Fr{\'e}chet Distance as a metric for evaluating IR systems, particularly in settings where a few labels are available. This approach contributes to the advancement of evaluation methodologies in real-world scenarios such as the assessment of generative IR systems.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single conference paper; the enclosing proceedings volume is the "host" relatedItem. -->
  <mods ID="arabzadeh-clarke-2024-frechet">
    <titleInfo>
      <title>Fréchet Distance for Offline Evaluation of Information Retrieval Systems with Sparse Labels</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Negar</namePart>
      <namePart type="family">Arabzadeh</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Charles</namePart>
      <namePart type="family">Clarke</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yvette</namePart>
        <namePart type="family">Graham</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Matthew</namePart>
        <namePart type="family">Purver</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">St. Julian’s, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- Abstract punctuation repaired: two sentence joins in the exported text lacked a space after the period. -->
    <abstract>The rapid advancement of natural language processing, information retrieval (IR), computer vision, and other technologies has presented significant challenges in evaluating the performance of these systems. One of the main challenges is the scarcity of human-labeled data, which hinders the fair and accurate assessment of these systems. In this work, we specifically focus on evaluating IR systems with sparse labels, borrowing from recent research on evaluating computer vision tasks. Taking inspiration from the success of using Fréchet Inception Distance (FID) in assessing text-to-image generation systems, we propose leveraging the Fréchet Distance to measure the distance between the distributions of relevant judged items and retrieved results. Our experimental results on MS MARCO V1 dataset and TREC Deep Learning Tracks query sets demonstrate the effectiveness of the Fréchet Distance as a metric for evaluating IR systems, particularly in settings where a few labels are available. This approach contributes to the advancement of evaluation methodologies in real-world scenarios such as the assessment of generative IR systems.</abstract>
    <identifier type="citekey">arabzadeh-clarke-2024-frechet</identifier>
    <location>
      <url>https://aclanthology.org/2024.eacl-long.26</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2024-03</date>
      <extent unit="page">
        <start>420</start>
        <end>431</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Fréchet Distance for Offline Evaluation of Information Retrieval Systems with Sparse Labels
%A Arabzadeh, Negar
%A Clarke, Charles
%Y Graham, Yvette
%Y Purver, Matthew
%S Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F arabzadeh-clarke-2024-frechet
%X The rapid advancement of natural language processing, information retrieval (IR), computer vision, and other technologies has presented significant challenges in evaluating the performance of these systems. One of the main challenges is the scarcity of human-labeled data, which hinders the fair and accurate assessment of these systems. In this work, we specifically focus on evaluating IR systems with sparse labels, borrowing from recent research on evaluating computer vision tasks. Taking inspiration from the success of using Fréchet Inception Distance (FID) in assessing text-to-image generation systems, we propose leveraging the Fréchet Distance to measure the distance between the distributions of relevant judged items and retrieved results. Our experimental results on MS MARCO V1 dataset and TREC Deep Learning Tracks query sets demonstrate the effectiveness of the Fréchet Distance as a metric for evaluating IR systems, particularly in settings where a few labels are available. This approach contributes to the advancement of evaluation methodologies in real-world scenarios such as the assessment of generative IR systems.
%U https://aclanthology.org/2024.eacl-long.26
%P 420-431
Markdown (Informal)
[Fréchet Distance for Offline Evaluation of Information Retrieval Systems with Sparse Labels](https://aclanthology.org/2024.eacl-long.26) (Arabzadeh & Clarke, EACL 2024)
ACL