@inproceedings{chakraborty-etal-2025-need,
title = "Do We Need Large {VLM}s for Spotting Soccer Actions?",
author = "Chakraborty, Ritabrata and
Chakraborty, Rajatsubhra and
Dasgupta, Avijit and
Chaurasia, Sandeep",
editor = "T.y.s.s, Santosh and
Shimizu, Shuichiro and
Gong, Yifan",
booktitle = "The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.ijcnlp-srw.6/",
doi = "10.18653/v1/2025.ijcnlp-srw.6",
pages = "59--65",
ISBN = "979-8-89176-304-3",
abstract = "Traditional video-based tasks like soccer action spotting rely heavily on visual inputs, often requiring complex and computationally expensive models to process dense video data. We propose a shift from this video-centric approach to a text-based task, making it lightweight and scalable by utilizing Large Language Models (LLMs) instead of Vision-Language Models (VLMs). We posit that expert commentary, which provides rich descriptions and contextual cues contains sufficient information to reliably spot key actions in a match. To demonstrate this, we employ a system of three LLMs acting as judges specializing in outcome, excitement, and tactics for spotting actions in soccer matches. Our experiments show that this language-centric approach performs effectively in detecting critical match events coming close to state-of-the-art video-based spotters while using zero video processing compute and similar amount of time to process the entire match."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chakraborty-etal-2025-need">
<titleInfo>
<title>Do We Need Large VLMs for Spotting Soccer Actions?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ritabrata</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajatsubhra</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Avijit</namePart>
<namePart type="family">Dasgupta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandeep</namePart>
<namePart type="family">Chaurasia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Santosh</namePart>
<namePart type="family">T.y.s.s</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuichiro</namePart>
<namePart type="family">Shimizu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifan</namePart>
<namePart type="family">Gong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Mumbai, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-304-3</identifier>
</relatedItem>
<abstract>Traditional video-based tasks like soccer action spotting rely heavily on visual inputs, often requiring complex and computationally expensive models to process dense video data. We propose a shift from this video-centric approach to a text-based task, making it lightweight and scalable by utilizing Large Language Models (LLMs) instead of Vision-Language Models (VLMs). We posit that expert commentary, which provides rich descriptions and contextual cues contains sufficient information to reliably spot key actions in a match. To demonstrate this, we employ a system of three LLMs acting as judges specializing in outcome, excitement, and tactics for spotting actions in soccer matches. Our experiments show that this language-centric approach performs effectively in detecting critical match events coming close to state-of-the-art video-based spotters while using zero video processing compute and similar amount of time to process the entire match.</abstract>
<identifier type="citekey">chakraborty-etal-2025-need</identifier>
<identifier type="doi">10.18653/v1/2025.ijcnlp-srw.6</identifier>
<location>
<url>https://aclanthology.org/2025.ijcnlp-srw.6/</url>
</location>
<part>
<date>2025-12</date>
<extent unit="page">
<start>59</start>
<end>65</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Do We Need Large VLMs for Spotting Soccer Actions?
%A Chakraborty, Ritabrata
%A Chakraborty, Rajatsubhra
%A Dasgupta, Avijit
%A Chaurasia, Sandeep
%Y T.y.s.s, Santosh
%Y Shimizu, Shuichiro
%Y Gong, Yifan
%S The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics
%D 2025
%8 December
%I Association for Computational Linguistics
%C Mumbai, India
%@ 979-8-89176-304-3
%F chakraborty-etal-2025-need
%X Traditional video-based tasks like soccer action spotting rely heavily on visual inputs, often requiring complex and computationally expensive models to process dense video data. We propose a shift from this video-centric approach to a text-based task, making it lightweight and scalable by utilizing Large Language Models (LLMs) instead of Vision-Language Models (VLMs). We posit that expert commentary, which provides rich descriptions and contextual cues contains sufficient information to reliably spot key actions in a match. To demonstrate this, we employ a system of three LLMs acting as judges specializing in outcome, excitement, and tactics for spotting actions in soccer matches. Our experiments show that this language-centric approach performs effectively in detecting critical match events coming close to state-of-the-art video-based spotters while using zero video processing compute and similar amount of time to process the entire match.
%R 10.18653/v1/2025.ijcnlp-srw.6
%U https://aclanthology.org/2025.ijcnlp-srw.6/
%U https://doi.org/10.18653/v1/2025.ijcnlp-srw.6
%P 59-65
Markdown (Informal)
[Do We Need Large VLMs for Spotting Soccer Actions?](https://aclanthology.org/2025.ijcnlp-srw.6/) (Chakraborty et al., IJCNLP-AACL 2025)
ACL
- Ritabrata Chakraborty, Rajatsubhra Chakraborty, Avijit Dasgupta, and Sandeep Chaurasia. 2025. Do We Need Large VLMs for Spotting Soccer Actions?. In The 14th International Joint Conference on Natural Language Processing and The 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 59–65, Mumbai, India. Association for Computational Linguistics.