@inproceedings{au-etal-2022-e,
title = "{E}-{NER} {---} An Annotated Named Entity Recognition Corpus of Legal Text",
author = "Au, Ting Wai Terence and
Lampos, Vasileios and
Cox, Ingemar",
editor = "Aletras, Nikolaos and
Chalkidis, Ilias and
Barrett, Leslie and
Goan{\textcommabelow{t}}{\u{a}}, C{\u{a}}t{\u{a}}lina and
Preo{\textcommabelow{t}}iuc-Pietro, Daniel",
booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2022",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.nllp-1.22",
doi = "10.18653/v1/2022.nllp-1.22",
pages = "246--255",
abstract = "Identifying named entities such as a person, location or organization, in documents can highlight key information to readers. Training Named Entity Recognition (NER) models requires an annotated data set, which can be a time-consuming labour-intensive task. Nevertheless, there are publicly available NER data sets for general English. Recently there has been interest in developing NER for legal text. However, prior work and experimental results reported here indicate that there is a significant degradation in performance when NER methods trained on a general English data set are applied to legal text. We describe a publicly available legal NER data set, called E-NER, based on legal company filings available from the US Securities and Exchange Commission{'}s EDGAR data set. Training a number of different NER algorithms on the general English CoNLL-2003 corpus but testing on our test collection confirmed significant degradations in accuracy, as measured by the F1-score, of between 29.4{\%} and 60.4{\%}, compared to training and testing on the E-NER collection.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="au-etal-2022-e">
<titleInfo>
<title>E-NER — An Annotated Named Entity Recognition Corpus of Legal Text</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ting</namePart>
<namePart type="given">Wai</namePart>
<namePart type="given">Terence</namePart>
<namePart type="family">Au</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vasileios</namePart>
<namePart type="family">Lampos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ingemar</namePart>
<namePart type="family">Cox</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Natural Legal Language Processing Workshop 2022</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nikolaos</namePart>
<namePart type="family">Aletras</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ilias</namePart>
<namePart type="family">Chalkidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leslie</namePart>
<namePart type="family">Barrett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cătălina</namePart>
<namePart type="family">Goan\textcommabelowtă</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preo\textcommabelowtiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Identifying named entities such as a person, location or organization, in documents can highlight key information to readers. Training Named Entity Recognition (NER) models requires an annotated data set, which can be a time-consuming labour-intensive task. Nevertheless, there are publicly available NER data sets for general English. Recently there has been interest in developing NER for legal text. However, prior work and experimental results reported here indicate that there is a significant degradation in performance when NER methods trained on a general English data set are applied to legal text. We describe a publicly available legal NER data set, called E-NER, based on legal company filings available from the US Securities and Exchange Commission’s EDGAR data set. Training a number of different NER algorithms on the general English CoNLL-2003 corpus but testing on our test collection confirmed significant degradations in accuracy, as measured by the F1-score, of between 29.4% and 60.4%, compared to training and testing on the E-NER collection.</abstract>
<identifier type="citekey">au-etal-2022-e</identifier>
<identifier type="doi">10.18653/v1/2022.nllp-1.22</identifier>
<location>
<url>https://aclanthology.org/2022.nllp-1.22</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>246</start>
<end>255</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T E-NER — An Annotated Named Entity Recognition Corpus of Legal Text
%A Au, Ting Wai Terence
%A Lampos, Vasileios
%A Cox, Ingemar
%Y Aletras, Nikolaos
%Y Chalkidis, Ilias
%Y Barrett, Leslie
%Y Goan\textcommabelowtă, Cătălina
%Y Preo\textcommabelowtiuc-Pietro, Daniel
%S Proceedings of the Natural Legal Language Processing Workshop 2022
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F au-etal-2022-e
%X Identifying named entities such as a person, location or organization, in documents can highlight key information to readers. Training Named Entity Recognition (NER) models requires an annotated data set, which can be a time-consuming labour-intensive task. Nevertheless, there are publicly available NER data sets for general English. Recently there has been interest in developing NER for legal text. However, prior work and experimental results reported here indicate that there is a significant degradation in performance when NER methods trained on a general English data set are applied to legal text. We describe a publicly available legal NER data set, called E-NER, based on legal company filings available from the US Securities and Exchange Commission’s EDGAR data set. Training a number of different NER algorithms on the general English CoNLL-2003 corpus but testing on our test collection confirmed significant degradations in accuracy, as measured by the F1-score, of between 29.4% and 60.4%, compared to training and testing on the E-NER collection.
%R 10.18653/v1/2022.nllp-1.22
%U https://aclanthology.org/2022.nllp-1.22
%U https://doi.org/10.18653/v1/2022.nllp-1.22
%P 246-255
Markdown (Informal)
[E-NER — An Annotated Named Entity Recognition Corpus of Legal Text](https://aclanthology.org/2022.nllp-1.22) (Au et al., NLLP 2022)
ACL