@inproceedings{yu-etal-2024-freeeval,
title = "{F}ree{E}val: A Modular Framework for Trustworthy and Efficient Evaluation of Large Language Models",
author = "Yu, Zhuohao and
Gao, Chang and
Yao, Wenjin and
Wang, Yidong and
Zeng, Zhengran and
Ye, Wei and
Wang, Jindong and
Zhang, Yue and
Zhang, Shikun",
editor = "Hernandez Farias, Delia Irazu and
Hope, Tom and
Li, Manling",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.1",
pages = "1--13",
abstract = "The rapid growth of evaluation methodologies and datasets for large language models (LLMs) has created a pressing need for their unified integration. Meanwhile, concerns about data contamination and bias compromise the trustworthiness of evaluation findings, while the efficiency of evaluation processes remains a bottleneck due to the significant computational costs associated with LLM inference.In response to these challenges, we introduce FreeEval, a modular framework not only for conducting trustworthy and efficient automatic evaluations of LLMs but also serving as a platform to develop and validate new evaluation methodologies. FreeEval addresses key challenges through: (1) unified abstractions that simplify the integration of diverse evaluation methods, including dynamic evaluations requiring complex LLM interactions; (2) built-in meta-evaluation techniques such as data contamination detection and human evaluation to enhance result fairness; (3) a high-performance infrastructure with distributed computation and caching strategies for efficient large-scale evaluations; and (4) an interactive Visualizer for result analysis and interpretation to support innovation of evaluation techniques. We open-source all our code at https://github.com/WisdomShell/FreeEval and our demostration video, live demo, installation guides are available at: https://freeeval.zhuohao.me/.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2024-freeeval">
<titleInfo>
<title>FreeEval: A Modular Framework for Trustworthy and Efficient Evaluation of Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhuohao</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chang</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjin</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yidong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhengran</namePart>
<namePart type="family">Zeng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Ye</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jindong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yue</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shikun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Delia</namePart>
<namePart type="given">Irazu</namePart>
<namePart type="family">Hernandez Farias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Hope</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manling</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The rapid growth of evaluation methodologies and datasets for large language models (LLMs) has created a pressing need for their unified integration. Meanwhile, concerns about data contamination and bias compromise the trustworthiness of evaluation findings, while the efficiency of evaluation processes remains a bottleneck due to the significant computational costs associated with LLM inference. In response to these challenges, we introduce FreeEval, a modular framework not only for conducting trustworthy and efficient automatic evaluations of LLMs but also serving as a platform to develop and validate new evaluation methodologies. FreeEval addresses key challenges through: (1) unified abstractions that simplify the integration of diverse evaluation methods, including dynamic evaluations requiring complex LLM interactions; (2) built-in meta-evaluation techniques such as data contamination detection and human evaluation to enhance result fairness; (3) a high-performance infrastructure with distributed computation and caching strategies for efficient large-scale evaluations; and (4) an interactive Visualizer for result analysis and interpretation to support innovation of evaluation techniques. We open-source all our code at https://github.com/WisdomShell/FreeEval, and our demonstration video, live demo, and installation guides are available at https://freeeval.zhuohao.me/.</abstract>
<identifier type="citekey">yu-etal-2024-freeeval</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-demo.1</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1</start>
<end>13</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T FreeEval: A Modular Framework for Trustworthy and Efficient Evaluation of Large Language Models
%A Yu, Zhuohao
%A Gao, Chang
%A Yao, Wenjin
%A Wang, Yidong
%A Zeng, Zhengran
%A Ye, Wei
%A Wang, Jindong
%A Zhang, Yue
%A Zhang, Shikun
%Y Hernandez Farias, Delia Irazu
%Y Hope, Tom
%Y Li, Manling
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F yu-etal-2024-freeeval
%X The rapid growth of evaluation methodologies and datasets for large language models (LLMs) has created a pressing need for their unified integration. Meanwhile, concerns about data contamination and bias compromise the trustworthiness of evaluation findings, while the efficiency of evaluation processes remains a bottleneck due to the significant computational costs associated with LLM inference. In response to these challenges, we introduce FreeEval, a modular framework not only for conducting trustworthy and efficient automatic evaluations of LLMs but also serving as a platform to develop and validate new evaluation methodologies. FreeEval addresses key challenges through: (1) unified abstractions that simplify the integration of diverse evaluation methods, including dynamic evaluations requiring complex LLM interactions; (2) built-in meta-evaluation techniques such as data contamination detection and human evaluation to enhance result fairness; (3) a high-performance infrastructure with distributed computation and caching strategies for efficient large-scale evaluations; and (4) an interactive Visualizer for result analysis and interpretation to support innovation of evaluation techniques. We open-source all our code at https://github.com/WisdomShell/FreeEval, and our demonstration video, live demo, and installation guides are available at https://freeeval.zhuohao.me/.
%U https://aclanthology.org/2024.emnlp-demo.1
%P 1-13
Markdown (Informal)
[FreeEval: A Modular Framework for Trustworthy and Efficient Evaluation of Large Language Models](https://aclanthology.org/2024.emnlp-demo.1) (Yu et al., EMNLP 2024)
ACL
Zhuohao Yu, Chang Gao, Wenjin Yao, Yidong Wang, Zhengran Zeng, Wei Ye, Jindong Wang, Yue Zhang, and Shikun Zhang. 2024. FreeEval: A Modular Framework for Trustworthy and Efficient Evaluation of Large Language Models. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 1–13, Miami, Florida, USA. Association for Computational Linguistics.