@inproceedings{farokh-etal-2025-diversity,
title = "Diversity is the Key: Enhancing {LLM}-based Post-processing for Automated Audio Captioning",
author = "Farokh, Seyed Ali and
Homayounpour, Mohammad Mehdi and
Nickabadi, Ahmad",
editor = "Chang, Kai-Wei and
Lu, Ke-Han and
Yang, Chih-Kai and
Tam, Zhi-Rui and
Chang, Wen-Yu and
Wang, Chung-Che",
booktitle = "Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)",
month = nov,
year = "2025",
address = "National Taiwan University, Taipei City, Taiwan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.rocling-main.10/",
pages = "87--94",
ISBN = "979-8-89176-379-1",
abstract = "Automated Audio Captioning (AAC) is a multimodal task aimed at generating natural language descriptions of audio content. Previous studies have shown that LLMs can improve AAC performance by summarizing audio events based on a list of candidate captions, which are selected by an external reranker from those generated using Nucleus Sampling. However, the reranking process often selects overly similar captions, disregarding the original diversity of the sampled captions. In this work, we show that this diversity reflects the AAC model{'}s level of certainty and propose a lightweight candidate selection approach that preserves the initial diversity of the generated captions. This, in turn, enables an LLM to summarize the captions while considering the AAC model{'}s certainty in a few-shot setting. Experimental results demonstrate that our method outperforms previous post-processing techniques while being significantly faster."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="farokh-etal-2025-diversity">
<titleInfo>
<title>Diversity is the Key: Enhancing LLM-based Post-processing for Automated Audio Captioning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Seyed</namePart>
<namePart type="given">Ali</namePart>
<namePart type="family">Farokh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Mehdi</namePart>
<namePart type="family">Homayounpour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmad</namePart>
<namePart type="family">Nickabadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kai-Wei</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ke-Han</namePart>
<namePart type="family">Lu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chih-Kai</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhi-Rui</namePart>
<namePart type="family">Tam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wen-Yu</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chung-Che</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">National Taiwan University, Taipei City, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-379-1</identifier>
</relatedItem>
<abstract>Automated Audio Captioning (AAC) is a multimodal task aimed at generating natural language descriptions of audio content. Previous studies have shown that LLMs can improve AAC performance by summarizing audio events based on a list of candidate captions, which are selected by an external reranker from those generated using Nucleus Sampling. However, the reranking process often selects overly similar captions, disregarding the original diversity of the sampled captions. In this work, we show that this diversity reflects the AAC model’s level of certainty and propose a lightweight candidate selection approach that preserves the initial diversity of the generated captions. This, in turn, enables an LLM to summarize the captions while considering the AAC model’s certainty in a few-shot setting. Experimental results demonstrate that our method outperforms previous post-processing techniques while being significantly faster.</abstract>
<identifier type="citekey">farokh-etal-2025-diversity</identifier>
<location>
<url>https://aclanthology.org/2025.rocling-main.10/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>87</start>
<end>94</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Diversity is the Key: Enhancing LLM-based Post-processing for Automated Audio Captioning
%A Farokh, Seyed Ali
%A Homayounpour, Mohammad Mehdi
%A Nickabadi, Ahmad
%Y Chang, Kai-Wei
%Y Lu, Ke-Han
%Y Yang, Chih-Kai
%Y Tam, Zhi-Rui
%Y Chang, Wen-Yu
%Y Wang, Chung-Che
%S Proceedings of the 37th Conference on Computational Linguistics and Speech Processing (ROCLING 2025)
%D 2025
%8 November
%I Association for Computational Linguistics
%C National Taiwan University, Taipei City, Taiwan
%@ 979-8-89176-379-1
%F farokh-etal-2025-diversity
%X Automated Audio Captioning (AAC) is a multimodal task aimed at generating natural language descriptions of audio content. Previous studies have shown that LLMs can improve AAC performance by summarizing audio events based on a list of candidate captions, which are selected by an external reranker from those generated using Nucleus Sampling. However, the reranking process often selects overly similar captions, disregarding the original diversity of the sampled captions. In this work, we show that this diversity reflects the AAC model’s level of certainty and propose a lightweight candidate selection approach that preserves the initial diversity of the generated captions. This, in turn, enables an LLM to summarize the captions while considering the AAC model’s certainty in a few-shot setting. Experimental results demonstrate that our method outperforms previous post-processing techniques while being significantly faster.
%U https://aclanthology.org/2025.rocling-main.10/
%P 87-94
Markdown (Informal)
[Diversity is the Key: Enhancing LLM-based Post-processing for Automated Audio Captioning](https://aclanthology.org/2025.rocling-main.10/) (Farokh et al., ROCLING 2025)
ACL