% COLING 2025 main-conference paper (ACL Anthology record; see the url field).
% Editor names use "Last, First" form; the compound surname "Di Eugenio" is
% kept whole so styles sort and abbreviate it correctly.
@inproceedings{xu-etal-2025-enhancing,
    title = "Enhancing Multimodal Named Entity Recognition through Adaptive Mixup Image Augmentation",
    author = "Xu, Bo and
      Jiang, Haiqi and
      Wei, Jie and
      Jing, Hongyu and
      Du, Ming and
      Song, Hui and
      Wang, Hongya and
      Xiao, Yanghua",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.122/",
    pages = "1802--1812",
    abstract = "Multimodal named entity recognition (MNER) extends traditional named entity recognition (NER) by integrating visual and textual information. However, current methods still face significant challenges due to the text-image mismatch problem. Recent advancements in text-to-image synthesis provide promising solutions, as synthesized images can introduce additional visual context to enhance MNER model performance. To fully leverage the benefits of both original and synthesized images, we propose an adaptive mixup image augmentation method. This method generates augmented images by determining the mixing ratio based on the matching score between the text and image, utilizing a triplet loss-based Gaussian Mixture Model (TL-GMM). Our approach is highly adaptable and can be seamlessly integrated into existing MNER models. Extensive experiments demonstrate consistent performance improvements, and detailed ablation studies and case studies confirm the effectiveness of our method."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="xu-etal-2025-enhancing">
<titleInfo>
<title>Enhancing Multimodal Named Entity Recognition through Adaptive Mixup Image Augmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haiqi</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jie</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongyu</namePart>
<namePart type="family">Jing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ming</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hui</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongya</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanghua</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Multimodal named entity recognition (MNER) extends traditional named entity recognition (NER) by integrating visual and textual information. However, current methods still face significant challenges due to the text-image mismatch problem. Recent advancements in text-to-image synthesis provide promising solutions, as synthesized images can introduce additional visual context to enhance MNER model performance. To fully leverage the benefits of both original and synthesized images, we propose an adaptive mixup image augmentation method. This method generates augmented images by determining the mixing ratio based on the matching score between the text and image, utilizing a triplet loss-based Gaussian Mixture Model (TL-GMM). Our approach is highly adaptable and can be seamlessly integrated into existing MNER models. Extensive experiments demonstrate consistent performance improvements, and detailed ablation studies and case studies confirm the effectiveness of our method.</abstract>
<identifier type="citekey">xu-etal-2025-enhancing</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.122/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>1802</start>
<end>1812</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Enhancing Multimodal Named Entity Recognition through Adaptive Mixup Image Augmentation
%A Xu, Bo
%A Jiang, Haiqi
%A Wei, Jie
%A Jing, Hongyu
%A Du, Ming
%A Song, Hui
%A Wang, Hongya
%A Xiao, Yanghua
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F xu-etal-2025-enhancing
%X Multimodal named entity recognition (MNER) extends traditional named entity recognition (NER) by integrating visual and textual information. However, current methods still face significant challenges due to the text-image mismatch problem. Recent advancements in text-to-image synthesis provide promising solutions, as synthesized images can introduce additional visual context to enhance MNER model performance. To fully leverage the benefits of both original and synthesized images, we propose an adaptive mixup image augmentation method. This method generates augmented images by determining the mixing ratio based on the matching score between the text and image, utilizing a triplet loss-based Gaussian Mixture Model (TL-GMM). Our approach is highly adaptable and can be seamlessly integrated into existing MNER models. Extensive experiments demonstrate consistent performance improvements, and detailed ablation studies and case studies confirm the effectiveness of our method.
%U https://aclanthology.org/2025.coling-main.122/
%P 1802-1812
Markdown (Informal)
[Enhancing Multimodal Named Entity Recognition through Adaptive Mixup Image Augmentation](https://aclanthology.org/2025.coling-main.122/) (Xu et al., COLING 2025)
ACL
- Bo Xu, Haiqi Jiang, Jie Wei, Hongyu Jing, Ming Du, Hui Song, Hongya Wang, and Yanghua Xiao. 2025. Enhancing Multimodal Named Entity Recognition through Adaptive Mixup Image Augmentation. In Proceedings of the 31st International Conference on Computational Linguistics, pages 1802–1812, Abu Dhabi, UAE. Association for Computational Linguistics.