@inproceedings{gu-etal-2024-tail,
title = "{TAIL}: A Toolkit for Automatic and Realistic Long-Context Large Language Model Evaluation",
author = "Gu, Gefei and
Zhao, Yilun and
Ning, Ruoxi and
Zheng, Yanan and
Cohan, Arman",
editor = "Hernandez Farias, Delia Irazu and
Hope, Tom and
Li, Manling",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-demo.21",
pages = "198--208",
abstract = "As long-context large language models (LLMs) are attracting increasing attention for their ability to handle context windows exceeding 128k tokens, the need for effective evaluation methods for these models becomes critical.Existing evaluation methods, however, fall short: needle-in-a-haystack (NIAH) and its variants are overly simplistic, while creating realistic benchmarks is prohibitively expensive due to extensive human annotation requirements. To bridge this gap, we propose TAIL, an automatic toolkit for creating realistic evaluation benchmarks and assessing the performance of long-context LLMs.With TAIL, users can customize the building of a long-context, document-grounded QA benchmark and obtain visualized performance metrics of evaluated models.TAIL has the advantage of requiring minimal human annotation and generating natural questions based on user-provided long-context documents. We apply TAIL to construct a benchmark encompassing multiple expert domains, such as finance, law, patent, and scientific literature. We then evaluate four state-of-the-art long-context LLMs using this benchmark. Results show that all LLMs experience varyingdegrees of performance degradation as contextlengths increase.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gu-etal-2024-tail">
<titleInfo>
<title>TAIL: A Toolkit for Automatic and Realistic Long-Context Large Language Model Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gefei</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yilun</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruoxi</namePart>
<namePart type="family">Ning</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanan</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arman</namePart>
<namePart type="family">Cohan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Delia</namePart>
<namePart type="given">Irazu</namePart>
<namePart type="family">Hernandez Farias</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Hope</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Manling</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>As long-context large language models (LLMs) are attracting increasing attention for their ability to handle context windows exceeding 128k tokens, the need for effective evaluation methods for these models becomes critical.Existing evaluation methods, however, fall short: needle-in-a-haystack (NIAH) and its variants are overly simplistic, while creating realistic benchmarks is prohibitively expensive due to extensive human annotation requirements. To bridge this gap, we propose TAIL, an automatic toolkit for creating realistic evaluation benchmarks and assessing the performance of long-context LLMs.With TAIL, users can customize the building of a long-context, document-grounded QA benchmark and obtain visualized performance metrics of evaluated models.TAIL has the advantage of requiring minimal human annotation and generating natural questions based on user-provided long-context documents. We apply TAIL to construct a benchmark encompassing multiple expert domains, such as finance, law, patent, and scientific literature. We then evaluate four state-of-the-art long-context LLMs using this benchmark. Results show that all LLMs experience varyingdegrees of performance degradation as contextlengths increase.</abstract>
<identifier type="citekey">gu-etal-2024-tail</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-demo.21</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>198</start>
<end>208</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TAIL: A Toolkit for Automatic and Realistic Long-Context Large Language Model Evaluation
%A Gu, Gefei
%A Zhao, Yilun
%A Ning, Ruoxi
%A Zheng, Yanan
%A Cohan, Arman
%Y Hernandez Farias, Delia Irazu
%Y Hope, Tom
%Y Li, Manling
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F gu-etal-2024-tail
%X As long-context large language models (LLMs) are attracting increasing attention for their ability to handle context windows exceeding 128k tokens, the need for effective evaluation methods for these models becomes critical.Existing evaluation methods, however, fall short: needle-in-a-haystack (NIAH) and its variants are overly simplistic, while creating realistic benchmarks is prohibitively expensive due to extensive human annotation requirements. To bridge this gap, we propose TAIL, an automatic toolkit for creating realistic evaluation benchmarks and assessing the performance of long-context LLMs.With TAIL, users can customize the building of a long-context, document-grounded QA benchmark and obtain visualized performance metrics of evaluated models.TAIL has the advantage of requiring minimal human annotation and generating natural questions based on user-provided long-context documents. We apply TAIL to construct a benchmark encompassing multiple expert domains, such as finance, law, patent, and scientific literature. We then evaluate four state-of-the-art long-context LLMs using this benchmark. Results show that all LLMs experience varyingdegrees of performance degradation as contextlengths increase.
%U https://aclanthology.org/2024.emnlp-demo.21
%P 198-208
Markdown (Informal)
[TAIL: A Toolkit for Automatic and Realistic Long-Context Large Language Model Evaluation](https://aclanthology.org/2024.emnlp-demo.21) (Gu et al., EMNLP 2024)
ACL