@inproceedings{sagare-etal-2024-audio-visual,
title = "Audio-visual training for improved grounding in video-text {LLM}s",
author = "Sagare, Shivprasad Rajendra and
S, Hemachandran and
Sarabhai, Kinshuk and
Ullegaddi, Prashant and
Sa, Rajeshkumar",
editor = "Mahamood, Saad and
Minh, Nguyen Le and
Ippolito, Daphne",
booktitle = "Proceedings of the 17th International Natural Language Generation Conference",
month = sep,
year = "2024",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.inlg-main.36",
pages = "440--445",
abstract = "Recent advances in multimodal LLMs, have led to several video-text models being proposed for critical video-related tasks. However, most of the previous works support visual input only, essentially muting the audio signal in the video. Few models that support both audio and visual input, are not explicitly trained on audio data. Hence, the effect of audio towards video understanding is largely unexplored. To this end, we propose a model architecture that handles audio-visual inputs explicitly. We train our model with both audio and visual data from a video instruction-tuning dataset. Comparison with vision-only baselines, and other audio-visual models showcase that training on audio data indeed leads to better grounding of responses. For better evaluation of audio-visual models, we also release a human-annotated benchmark dataset, with audio-aware question-answer pairs.",
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sagare-etal-2024-audio-visual">
    <titleInfo>
      <title>Audio-visual training for improved grounding in video-text LLMs</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shivprasad</namePart>
      <namePart type="given">Rajendra</namePart>
      <namePart type="family">Sagare</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hemachandran</namePart>
      <namePart type="family">S</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kinshuk</namePart>
      <namePart type="family">Sarabhai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Prashant</namePart>
      <namePart type="family">Ullegaddi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rajeshkumar</namePart>
      <namePart type="family">Sa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th International Natural Language Generation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Saad</namePart>
        <namePart type="family">Mahamood</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nguyen</namePart>
        <namePart type="given">Le</namePart>
        <namePart type="family">Minh</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daphne</namePart>
        <namePart type="family">Ippolito</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Tokyo, Japan</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Recent advances in multimodal LLMs have led to several video-text models being proposed for critical video-related tasks. However, most previous works support visual input only, essentially muting the audio signal in the video. The few models that support both audio and visual input are not explicitly trained on audio data. Hence, the effect of audio on video understanding is largely unexplored. To this end, we propose a model architecture that handles audio-visual inputs explicitly. We train our model with both audio and visual data from a video instruction-tuning dataset. Comparisons with vision-only baselines and other audio-visual models show that training on audio data indeed leads to better grounding of responses. For better evaluation of audio-visual models, we also release a human-annotated benchmark dataset with audio-aware question-answer pairs.</abstract>
    <identifier type="citekey">sagare-etal-2024-audio-visual</identifier>
    <location>
      <url>https://aclanthology.org/2024.inlg-main.36</url>
    </location>
    <part>
      <date>2024-09</date>
      <extent unit="page">
        <start>440</start>
        <end>445</end>
      </extent>
    </part>
  </mods>
</modsCollection>

%0 Conference Proceedings
%T Audio-visual training for improved grounding in video-text LLMs
%A Sagare, Shivprasad Rajendra
%A S, Hemachandran
%A Sarabhai, Kinshuk
%A Ullegaddi, Prashant
%A Sa, Rajeshkumar
%Y Mahamood, Saad
%Y Minh, Nguyen Le
%Y Ippolito, Daphne
%S Proceedings of the 17th International Natural Language Generation Conference
%D 2024
%8 September
%I Association for Computational Linguistics
%C Tokyo, Japan
%F sagare-etal-2024-audio-visual
%X Recent advances in multimodal LLMs have led to several video-text models being proposed for critical video-related tasks. However, most previous works support visual input only, essentially muting the audio signal in the video. The few models that support both audio and visual input are not explicitly trained on audio data. Hence, the effect of audio on video understanding is largely unexplored. To this end, we propose a model architecture that handles audio-visual inputs explicitly. We train our model with both audio and visual data from a video instruction-tuning dataset. Comparisons with vision-only baselines and other audio-visual models show that training on audio data indeed leads to better grounding of responses. For better evaluation of audio-visual models, we also release a human-annotated benchmark dataset with audio-aware question-answer pairs.
%U https://aclanthology.org/2024.inlg-main.36
%P 440-445

Markdown (Informal)
[Audio-visual training for improved grounding in video-text LLMs](https://aclanthology.org/2024.inlg-main.36) (Sagare et al., INLG 2024)

ACL
Shivprasad Rajendra Sagare, Hemachandran S, Kinshuk Sarabhai, Prashant Ullegaddi, and Rajeshkumar Sa. 2024. Audio-visual training for improved grounding in video-text LLMs. In Proceedings of the 17th International Natural Language Generation Conference, pages 440–445, Tokyo, Japan. Association for Computational Linguistics.