@inproceedings{li-etal-2023-espvr,
title = "{ESPVR}: Entity Spans Position Visual Regions for Multimodal Named Entity Recognition",
author = "Li, Xiujiao and
Sun, Guanglu and
Liu, Xinyu",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.522",
doi = "10.18653/v1/2023.findings-emnlp.522",
pages = "7785--7794",
abstract = "Multimodal Named Entity Recognition (MNER) uses visual information to improve the performance of text-only Named Entity Recognition (NER). However, existing methods for acquiring local visual information suffer from certain limitations: (1) using an attention-based method to extract visual regions related to the text from visual regions obtained through convolutional architectures (e.g., ResNet), attention is distracted by the entire image, rather than being fully focused on the visual regions most relevant to the text; (2) using an object detection-based (e.g., Mask R-CNN) method to detect visual object regions related to the text, object detection has a limited range of recognition categories. Moreover, the visual regions obtained by object detection may not correspond to the entities in the text. In summary, the goal of these methods is not to extract the most relevant visual regions for the entities in the text. The visual regions obtained by these methods may be redundant or insufficient for the entities in the text. In this paper, we propose an Entity Spans Position Visual Regions (ESPVR) module to obtain the most relevant visual regions corresponding to the entities in the text. Experiments show that our proposed approach can achieve the SOTA on Twitter-2017 and competitive results on Twitter-2015.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="li-etal-2023-espvr">
<titleInfo>
<title>ESPVR: Entity Spans Position Visual Regions for Multimodal Named Entity Recognition</title>
</titleInfo>
<name type="personal">
<namePart type="given">Xiujiao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanglu</namePart>
<namePart type="family">Sun</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyu</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal Named Entity Recognition (MNER) uses visual information to improve the performance of text-only Named Entity Recognition (NER). However, existing methods for acquiring local visual information suffer from certain limitations: (1) using an attention-based method to extract visual regions related to the text from visual regions obtained through convolutional architectures (e.g., ResNet), attention is distracted by the entire image, rather than being fully focused on the visual regions most relevant to the text; (2) using an object detection-based (e.g., Mask R-CNN) method to detect visual object regions related to the text, object detection has a limited range of recognition categories. Moreover, the visual regions obtained by object detection may not correspond to the entities in the text. In summary, the goal of these methods is not to extract the most relevant visual regions for the entities in the text. The visual regions obtained by these methods may be redundant or insufficient for the entities in the text. In this paper, we propose an Entity Spans Position Visual Regions (ESPVR) module to obtain the most relevant visual regions corresponding to the entities in the text. Experiments show that our proposed approach can achieve the SOTA on Twitter-2017 and competitive results on Twitter-2015.</abstract>
<identifier type="citekey">li-etal-2023-espvr</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.522</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.522</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>7785</start>
<end>7794</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T ESPVR: Entity Spans Position Visual Regions for Multimodal Named Entity Recognition
%A Li, Xiujiao
%A Sun, Guanglu
%A Liu, Xinyu
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F li-etal-2023-espvr
%X Multimodal Named Entity Recognition (MNER) uses visual information to improve the performance of text-only Named Entity Recognition (NER). However, existing methods for acquiring local visual information suffer from certain limitations: (1) using an attention-based method to extract visual regions related to the text from visual regions obtained through convolutional architectures (e.g., ResNet), attention is distracted by the entire image, rather than being fully focused on the visual regions most relevant to the text; (2) using an object detection-based (e.g., Mask R-CNN) method to detect visual object regions related to the text, object detection has a limited range of recognition categories. Moreover, the visual regions obtained by object detection may not correspond to the entities in the text. In summary, the goal of these methods is not to extract the most relevant visual regions for the entities in the text. The visual regions obtained by these methods may be redundant or insufficient for the entities in the text. In this paper, we propose an Entity Spans Position Visual Regions (ESPVR) module to obtain the most relevant visual regions corresponding to the entities in the text. Experiments show that our proposed approach can achieve the SOTA on Twitter-2017 and competitive results on Twitter-2015.
%R 10.18653/v1/2023.findings-emnlp.522
%U https://aclanthology.org/2023.findings-emnlp.522
%U https://doi.org/10.18653/v1/2023.findings-emnlp.522
%P 7785-7794
Markdown (Informal)
[ESPVR: Entity Spans Position Visual Regions for Multimodal Named Entity Recognition](https://aclanthology.org/2023.findings-emnlp.522) (Li et al., Findings 2023)
ACL