BibTeX
@inproceedings{kusa-etal-2022-dossier,
    title = "{D}o{SSIER} at {M}ed{V}id{QA} 2022: Text-based Approaches to Medical Video Answer Localization Problem",
    author = "Kusa, Wojciech and
      Peikos, Georgios and
      Espitia, {\'O}scar and
      Hanbury, Allan and
      Pasi, Gabriella",
    editor = "Demner-Fushman, Dina and
      Cohen, Kevin Bretonnel and
      Ananiadou, Sophia and
      Tsujii, Junichi",
    booktitle = "Proceedings of the 21st Workshop on Biomedical Language Processing",
    month = may,
    year = "2022",
    address = "Dublin, Ireland",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.bionlp-1.43",
    doi = "10.18653/v1/2022.bionlp-1.43",
    pages = "432--440",
    abstract = "This paper describes our contribution to the Answer Localization track of the MedVidQA 2022 Shared Task. We propose two answer localization approaches that use only textual information extracted from the video. In particular, our approaches exploit the text extracted from the video{'}s transcripts along with the text displayed in the video{'}s frames to create a set of features. Having created a set of features that represents a video{'}s textual information, we employ four different models to measure the similarity between a video{'}s segment and a corresponding question. Then, we employ two different methods to obtain the start and end times of the identified answer. One of them is based on a random forest regressor, whereas the other one uses an unsupervised peak detection model to detect the answer{'}s start time. Our findings suggest that for this task, leveraging only text-related features (transmitted either verbally or visually) and using a small amount of training data lead to significant improvements over the benchmark Video Span Localization model that is based on deep neural networks.",
}
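To reuse the BibTeX record above programmatically, one option is the third-party bibtexparser package. A minimal sketch, assuming its v1 API and a hypothetical file name:

```python
# Minimal sketch: load the BibTeX entry above with the third-party
# `bibtexparser` package (v1 API). The file name is a hypothetical example.
import bibtexparser

with open("kusa-etal-2022-dossier.bib") as f:
    db = bibtexparser.load(f)

entry = db.entries[0]        # one dict per entry, lowercase field keys
print(entry["ID"])           # kusa-etal-2022-dossier
print(entry["doi"])          # 10.18653/v1/2022.bionlp-1.43
print(entry["pages"])        # 432--440
```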
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kusa-etal-2022-dossier">
    <titleInfo>
      <title>DoSSIER at MedVidQA 2022: Text-based Approaches to Medical Video Answer Localization Problem</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Wojciech</namePart>
      <namePart type="family">Kusa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Georgios</namePart>
      <namePart type="family">Peikos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Óscar</namePart>
      <namePart type="family">Espitia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Allan</namePart>
      <namePart type="family">Hanbury</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gabriella</namePart>
      <namePart type="family">Pasi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 21st Workshop on Biomedical Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Dina</namePart>
        <namePart type="family">Demner-Fushman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kevin</namePart>
        <namePart type="given">Bretonnel</namePart>
        <namePart type="family">Cohen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sophia</namePart>
        <namePart type="family">Ananiadou</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Junichi</namePart>
        <namePart type="family">Tsujii</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This paper describes our contribution to the Answer Localization track of the MedVidQA 2022 Shared Task. We propose two answer localization approaches that use only textual information extracted from the video. In particular, our approaches exploit the text extracted from the video’s transcripts along with the text displayed in the video’s frames to create a set of features. Having created a set of features that represents a video’s textual information, we employ four different models to measure the similarity between a video’s segment and a corresponding question. Then, we employ two different methods to obtain the start and end times of the identified answer. One of them is based on a random forest regressor, whereas the other one uses an unsupervised peak detection model to detect the answer’s start time. Our findings suggest that for this task, leveraging only text-related features (transmitted either verbally or visually) and using a small amount of training data lead to significant improvements over the benchmark Video Span Localization model that is based on deep neural networks.</abstract>
    <identifier type="citekey">kusa-etal-2022-dossier</identifier>
    <identifier type="doi">10.18653/v1/2022.bionlp-1.43</identifier>
    <location>
      <url>https://aclanthology.org/2022.bionlp-1.43</url>
    </location>
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>432</start>
        <end>440</end>
      </extent>
    </part>
  </mods>
</modsCollection>
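The MODS record can be consumed with nothing beyond the Python standard library; the sketch below pulls out the title, authors, and DOI (the file name mods.xml is an assumption):

```python
# Minimal sketch: extract title, authors, and DOI from the MODS record
# above using only the standard library ("mods.xml" is a hypothetical
# file holding the XML).
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}
mods = ET.parse("mods.xml").getroot().find("m:mods", NS)

title = mods.findtext("m:titleInfo/m:title", namespaces=NS)
doi = mods.findtext("m:identifier[@type='doi']", namespaces=NS)

authors = []
for name in mods.findall("m:name[@type='personal']", NS):
    # keep authors only; editors live under relatedItem and carry role "editor"
    if name.findtext("m:role/m:roleTerm", namespaces=NS) == "author":
        given = " ".join(
            part.text for part in name.findall("m:namePart[@type='given']", NS)
        )
        family = name.findtext("m:namePart[@type='family']", namespaces=NS)
        authors.append(f"{given} {family}")

print(title)
print("; ".join(authors))
print(doi)
```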
Endnote
%0 Conference Proceedings
%T DoSSIER at MedVidQA 2022: Text-based Approaches to Medical Video Answer Localization Problem
%A Kusa, Wojciech
%A Peikos, Georgios
%A Espitia, Óscar
%A Hanbury, Allan
%A Pasi, Gabriella
%Y Demner-Fushman, Dina
%Y Cohen, Kevin Bretonnel
%Y Ananiadou, Sophia
%Y Tsujii, Junichi
%S Proceedings of the 21st Workshop on Biomedical Language Processing
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F kusa-etal-2022-dossier
%X This paper describes our contribution to the Answer Localization track of the MedVidQA 2022 Shared Task. We propose two answer localization approaches that use only textual information extracted from the video. In particular, our approaches exploit the text extracted from the video’s transcripts along with the text displayed in the video’s frames to create a set of features. Having created a set of features that represents a video’s textual information, we employ four different models to measure the similarity between a video’s segment and a corresponding question. Then, we employ two different methods to obtain the start and end times of the identified answer. One of them is based on a random forest regressor, whereas the other one uses an unsupervised peak detection model to detect the answer’s start time. Our findings suggest that for this task, leveraging only text-related features (transmitted either verbally or visually) and using a small amount of training data lead to significant improvements over the benchmark Video Span Localization model that is based on deep neural networks.
%R 10.18653/v1/2022.bionlp-1.43
%U https://aclanthology.org/2022.bionlp-1.43
%U https://doi.org/10.18653/v1/2022.bionlp-1.43
%P 432-440
Markdown (Informal)
[DoSSIER at MedVidQA 2022: Text-based Approaches to Medical Video Answer Localization Problem](https://aclanthology.org/2022.bionlp-1.43) (Kusa et al., BioNLP 2022)
ACL
Wojciech Kusa, Georgios Peikos, Óscar Espitia, Allan Hanbury, and Gabriella Pasi. 2022. DoSSIER at MedVidQA 2022: Text-based Approaches to Medical Video Answer Localization Problem. In *Proceedings of the 21st Workshop on Biomedical Language Processing*, pages 432–440, Dublin, Ireland. Association for Computational Linguistics.
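The abstract describes an unsupervised peak detection model for finding the answer's start time. The authors' actual model is not reproduced here; purely as an illustration of the general idea, this sketch runs scipy.signal.find_peaks over made-up segment-to-question similarity scores:

```python
# Illustration only, not the paper's implementation: pick a candidate
# answer start time by peak-picking per-segment question-similarity
# scores, in the spirit of the unsupervised method named in the abstract.
import numpy as np
from scipy.signal import find_peaks

# Made-up similarity between the question and each 10-second segment,
# e.g. from a text-similarity model over the video transcript.
similarity = np.array([0.10, 0.20, 0.15, 0.70, 0.90, 0.85, 0.30, 0.20, 0.10])
SEGMENT_SECONDS = 10

peaks, _ = find_peaks(similarity, height=0.5)   # indices of strong peaks
if peaks.size:
    start_time = int(peaks[0]) * SEGMENT_SECONDS
    print(f"predicted answer start: {start_time}s")  # -> 40s for this input
```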