@inproceedings{zhang-etal-2026-mmuie,
title = "{MMUIE}: Massive Multi-Domain Universal Information Extraction for Long Documents",
author = "Zhang, Shuyi and
Chen, Zhenbin and
Li, Shuting and
Tu, Kewei and
Jing, Li and
Jia, Zixia and
Zheng, Zilong",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.334/",
pages = "6338--6370",
ISBN = "979-8-89176-386-9",
abstract = "We present **MMUIE**, a large-scale universal dataset for multi-domain, document-level information extraction (IE) from long texts. Existing IE systems predominantly operate at the sentence level or within narrow domains due to annotation constraints. MMUIE addresses this gap by introducing an automated annotation pipeline that integrates traditional knowledge bases with large language models to extract fine-grained entities, aliases, and relation triples across 34 domains. The dataset comprises a weakly-supervised training set and a manually verified test set, featuring 723 entity types and 456 relation types. Empirical evaluations reveal that existing sentence-level IE models and even advanced LLMs underperform on this task, highlighting the need for better domain-aware document-level models. To this end, we develop DocUIE, a universal IE model fine-tuned on MMUIE, which achieves strong generalization and transferability across domains. MMUIE lays the foundation for robust, scalable, and universal information extraction from long-form text in diverse real-world scenarios. All code, data, and models are available in https://github.com/Shuyi-zsy/Massive-Multi-Domain-UIE."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2026-mmuie">
<titleInfo>
<title>MMUIE: Massive Multi-Domain Universal Information Extraction for Long Documents</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shuyi</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhenbin</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shuting</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kewei</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Li</namePart>
<namePart type="family">Jing</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zixia</namePart>
<namePart type="family">Jia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zilong</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>We present **MMUIE**, a large-scale universal dataset for multi-domain, document-level information extraction (IE) from long texts. Existing IE systems predominantly operate at the sentence level or within narrow domains due to annotation constraints. MMUIE addresses this gap by introducing an automated annotation pipeline that integrates traditional knowledge bases with large language models to extract fine-grained entities, aliases, and relation triples across 34 domains. The dataset comprises a weakly-supervised training set and a manually verified test set, featuring 723 entity types and 456 relation types. Empirical evaluations reveal that existing sentence-level IE models and even advanced LLMs underperform on this task, highlighting the need for better domain-aware document-level models. To this end, we develop DocUIE, a universal IE model fine-tuned on MMUIE, which achieves strong generalization and transferability across domains. MMUIE lays the foundation for robust, scalable, and universal information extraction from long-form text in diverse real-world scenarios. All code, data, and models are available in https://github.com/Shuyi-zsy/Massive-Multi-Domain-UIE.</abstract>
<identifier type="citekey">zhang-etal-2026-mmuie</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.334/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>6338</start>
<end>6370</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MMUIE: Massive Multi-Domain Universal Information Extraction for Long Documents
%A Zhang, Shuyi
%A Chen, Zhenbin
%A Li, Shuting
%A Tu, Kewei
%A Jing, Li
%A Jia, Zixia
%A Zheng, Zilong
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F zhang-etal-2026-mmuie
%X We present **MMUIE**, a large-scale universal dataset for multi-domain, document-level information extraction (IE) from long texts. Existing IE systems predominantly operate at the sentence level or within narrow domains due to annotation constraints. MMUIE addresses this gap by introducing an automated annotation pipeline that integrates traditional knowledge bases with large language models to extract fine-grained entities, aliases, and relation triples across 34 domains. The dataset comprises a weakly-supervised training set and a manually verified test set, featuring 723 entity types and 456 relation types. Empirical evaluations reveal that existing sentence-level IE models and even advanced LLMs underperform on this task, highlighting the need for better domain-aware document-level models. To this end, we develop DocUIE, a universal IE model fine-tuned on MMUIE, which achieves strong generalization and transferability across domains. MMUIE lays the foundation for robust, scalable, and universal information extraction from long-form text in diverse real-world scenarios. All code, data, and models are available in https://github.com/Shuyi-zsy/Massive-Multi-Domain-UIE.
%U https://aclanthology.org/2026.findings-eacl.334/
%P 6338-6370
Markdown (Informal)
[MMUIE: Massive Multi-Domain Universal Information Extraction for Long Documents](https://aclanthology.org/2026.findings-eacl.334/) (Zhang et al., Findings 2026)
ACL