% BibTeX record for the R2LM 2025 workshop paper (ACL Anthology 2025.r2lm-1.2).
% The abstract below has had PDF line-break hyphenation artifacts rejoined;
% genuine hyphens (real-time, vision-language, trade-offs) are kept.
% NOTE(review): the editors "Guillena, Rafael Munoz" and "Cerda, Raul Garcia"
% look like Spanish double surnames that may have been split incorrectly
% (i.e. "Munoz Guillena, Rafael" and "Garcia Cerda, Raul") -- confirm against
% the proceedings front matter before changing.
@inproceedings{ditchfield-ogle-mitkov-2025-comparative,
    title = "A Comparative Study of Vision Transformers and Multimodal Language Models for Violence Detection in Videos",
    author = "Ditchfield-Ogle, Tomas and
      Mitkov, Ruslan",
    editor = "Picazo-Izquierdo, Alicia and
      Estevanell-Valladares, Ernesto Luis and
      Mitkov, Ruslan and
      Guillena, Rafael Mu{\~n}oz and
      Cerd{\'a}, Ra{\'u}l Garc{\'i}a",
    booktitle = "Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models",
    month = sep,
    year = "2025",
    address = "Varna, Bulgaria",
    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
    url = "https://aclanthology.org/2025.r2lm-1.2/",
    pages = "10--20",
    abstract = "This project compares methods for detecting violent videos, which are crucial for ensuring real-time safety in surveillance and digital moderation. It evaluates four approaches: a random forest classifier, a transformer model, and two multimodal vision-language models. The process involves preprocessing datasets, training models, and assessing accuracy, interpretability, scalability, and real-time suitability. Results show that traditional methods are simple but less effective. The transformer model achieved high accuracy, and the multimodal models offered high violence recall with descriptive justifications. The study highlights trade-offs and provides practical insights for the deployment of automated violence detection."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ditchfield-ogle-mitkov-2025-comparative">
<titleInfo>
<title>A Comparative Study of Vision Transformers and Multimodal Language Models for Violence Detection in Videos</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tomas</namePart>
<namePart type="family">Ditchfield-Ogle</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alicia</namePart>
<namePart type="family">Picazo-Izquierdo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ernesto</namePart>
<namePart type="given">Luis</namePart>
<namePart type="family">Estevanell-Valladares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafael</namePart>
<namePart type="given">Muñoz</namePart>
<namePart type="family">Guillena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raúl</namePart>
<namePart type="given">García</namePart>
<namePart type="family">Cerdá</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This project compares methods for detecting violent videos, which are crucial for ensuring real-time safety in surveillance and digital moderation. It evaluates four approaches: a random forest classifier, a transformer model, and two multimodal vision-language models. The process involves preprocessing datasets, training models, and assessing accuracy, interpretability, scalability, and real-time suitability. Results show that traditional methods are simple but less effective. The transformer model achieved high accuracy, and the multimodal models offered high violence recall with descriptive justifications. The study highlights trade-offs and provides practical insights for the deployment of automated violence detection.</abstract>
<identifier type="citekey">ditchfield-ogle-mitkov-2025-comparative</identifier>
<location>
<url>https://aclanthology.org/2025.r2lm-1.2/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>10</start>
<end>20</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Comparative Study of Vision Transformers and Multimodal Language Models for Violence Detection in Videos
%A Ditchfield-Ogle, Tomas
%A Mitkov, Ruslan
%Y Picazo-Izquierdo, Alicia
%Y Estevanell-Valladares, Ernesto Luis
%Y Mitkov, Ruslan
%Y Guillena, Rafael Muñoz
%Y Cerdá, Raúl García
%S Proceedings of the First Workshop on Comparative Performance Evaluation: From Rules to Language Models
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F ditchfield-ogle-mitkov-2025-comparative
%X This project compares methods for detecting violent videos, which are crucial for ensuring real-time safety in surveillance and digital moderation. It evaluates four approaches: a random forest classifier, a transformer model, and two multimodal vision-language models. The process involves preprocessing datasets, training models, and assessing accuracy, interpretability, scalability, and real-time suitability. Results show that traditional methods are simple but less effective. The transformer model achieved high accuracy, and the multimodal models offered high violence recall with descriptive justifications. The study highlights trade-offs and provides practical insights for the deployment of automated violence detection.
%U https://aclanthology.org/2025.r2lm-1.2/
%P 10-20
Markdown (Informal)
[A Comparative Study of Vision Transformers and Multimodal Language Models for Violence Detection in Videos](https://aclanthology.org/2025.r2lm-1.2/) (Ditchfield-Ogle & Mitkov, R2LM 2025)
ACL