@inproceedings{xu-etal-2024-effective,
title = "An Effective Span-based Multimodal Named Entity Recognition with Consistent Cross-Modal Alignment",
author = "Xu, Yongxiu and
Xu, Hao and
Huang, Heyan and
Cui, Shiyao and
Tang, Minghao and
Wang, Longzheng and
Xu, Hongbo",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.lrec-main.95",
pages = "1063--1072",
abstract = "With the increasing availability of multimodal content on social media, consisting primarily of text and images, multimodal named entity recognition (MNER) has gained a wide-spread attention. A fundamental challenge of MNER lies in effectively aligning different modalities. However, the majority of current approaches rely on word-based sequence labeling framework and align the image and text at inconsistent semantic levels (whole image-words or regions-words). This misalignment may lead to inferior entity recognition performance. To address this issue, we propose an effective span-based method, named SMNER, which achieves a more consistent multimodal alignment from the perspectives of information-theoretic and cross-modal interaction, respectively. Specifically, we first introduce a cross-modal information bottleneck module for the global-level multimodal alignment (whole image-whole text). This module aims to encourage the semantic distribution of the image to be closer to the semantic distribution of the text, which can enable the filtering out of visual noise. Next, we introduce a cross-modal attention module for the local-level multimodal alignment (regions-spans), which captures the correlations between regions in the image and spans in the text, enabling a more precise alignment of the two modalities. Extensive ex- periments conducted on two benchmark datasets demonstrate that SMNER outperforms the state-of-the-art baselines.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2024-effective">
<titleInfo>
<title>An Effective Span-based Multimodal Named Entity Recognition with Consistent Cross-Modal Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yongxiu</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heyan</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shiyao</namePart>
<namePart type="family">Cui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minghao</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Longzheng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongbo</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Min-Yen</namePart>
<namePart type="family">Kan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Veronique</namePart>
<namePart type="family">Hoste</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alessandro</namePart>
<namePart type="family">Lenci</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>With the increasing availability of multimodal content on social media, consisting primarily of text and images, multimodal named entity recognition (MNER) has gained a wide-spread attention. A fundamental challenge of MNER lies in effectively aligning different modalities. However, the majority of current approaches rely on word-based sequence labeling framework and align the image and text at inconsistent semantic levels (whole image-words or regions-words). This misalignment may lead to inferior entity recognition performance. To address this issue, we propose an effective span-based method, named SMNER, which achieves a more consistent multimodal alignment from the perspectives of information-theoretic and cross-modal interaction, respectively. Specifically, we first introduce a cross-modal information bottleneck module for the global-level multimodal alignment (whole image-whole text). This module aims to encourage the semantic distribution of the image to be closer to the semantic distribution of the text, which can enable the filtering out of visual noise. Next, we introduce a cross-modal attention module for the local-level multimodal alignment (regions-spans), which captures the correlations between regions in the image and spans in the text, enabling a more precise alignment of the two modalities. Extensive ex- periments conducted on two benchmark datasets demonstrate that SMNER outperforms the state-of-the-art baselines.</abstract>
<identifier type="citekey">xu-etal-2024-effective</identifier>
<location>
<url>https://aclanthology.org/2024.lrec-main.95</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>1063</start>
<end>1072</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Effective Span-based Multimodal Named Entity Recognition with Consistent Cross-Modal Alignment
%A Xu, Yongxiu
%A Xu, Hao
%A Huang, Heyan
%A Cui, Shiyao
%A Tang, Minghao
%A Wang, Longzheng
%A Xu, Hongbo
%Y Calzolari, Nicoletta
%Y Kan, Min-Yen
%Y Hoste, Veronique
%Y Lenci, Alessandro
%Y Sakti, Sakriani
%Y Xue, Nianwen
%S Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F xu-etal-2024-effective
%X With the increasing availability of multimodal content on social media, consisting primarily of text and images, multimodal named entity recognition (MNER) has gained a wide-spread attention. A fundamental challenge of MNER lies in effectively aligning different modalities. However, the majority of current approaches rely on word-based sequence labeling framework and align the image and text at inconsistent semantic levels (whole image-words or regions-words). This misalignment may lead to inferior entity recognition performance. To address this issue, we propose an effective span-based method, named SMNER, which achieves a more consistent multimodal alignment from the perspectives of information-theoretic and cross-modal interaction, respectively. Specifically, we first introduce a cross-modal information bottleneck module for the global-level multimodal alignment (whole image-whole text). This module aims to encourage the semantic distribution of the image to be closer to the semantic distribution of the text, which can enable the filtering out of visual noise. Next, we introduce a cross-modal attention module for the local-level multimodal alignment (regions-spans), which captures the correlations between regions in the image and spans in the text, enabling a more precise alignment of the two modalities. Extensive ex- periments conducted on two benchmark datasets demonstrate that SMNER outperforms the state-of-the-art baselines.
%U https://aclanthology.org/2024.lrec-main.95
%P 1063-1072
Markdown (Informal)
[An Effective Span-based Multimodal Named Entity Recognition with Consistent Cross-Modal Alignment](https://aclanthology.org/2024.lrec-main.95) (Xu et al., LREC-COLING 2024)
ACL
- Yongxiu Xu, Hao Xu, Heyan Huang, Shiyao Cui, Minghao Tang, Longzheng Wang, and Hongbo Xu. 2024. An Effective Span-based Multimodal Named Entity Recognition with Consistent Cross-Modal Alignment. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 1063–1072, Torino, Italia. ELRA and ICCL.