% PyramidInfer: KV cache compression for LLM inference (Yang et al., Findings of ACL 2024).
@inproceedings{yang-etal-2024-pyramidinfer,
  author    = {Yang, Dongjie and Han, Xiaodong and Gao, Yan and Hu, Yao and Zhang, Shilin and Zhao, Hai},
  title     = {{PyramidInfer}: Pyramid {KV} Cache Compression for High-throughput {LLM} Inference},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  pages     = {3258--3270},
  doi       = {10.18653/v1/2024.findings-acl.195},
  url       = {https://aclanthology.org/2024.findings-acl.195/},
}