@inproceedings{zafar-etal-2025-courtroom,
title = "From Courtroom to Corpora: Building a Name Entity Corpus for {U}rdu Legal Texts",
author = "Zafar, Adeel and
Ashraf, Sohail and
Nowaczyk, Slawomir",
editor = "Angelova, Galia and
Kunilovskaya, Maria and
Escribe, Marie and
Mitkov, Ruslan",
booktitle = "Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era",
month = sep,
year = "2025",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://aclanthology.org/2025.ranlp-1.161/",
pages = "1396--1405",
abstract = "This study explores the effectiveness of transformer-based models for Named Entity Recognition (NER) in Urdu legal documents, a critical task in low-resource language processing. Given the legal texts' specialized terminology and complex syntax, accurate entity recognition in Urdu remains challenging. We developed a legal Urdu dataset that contains 117,500 documents, generated synthetically from 47 different types of legal documents, and evaluated three BERT-based models (XLMRoBERTa, mBERT, and DistilBERT) by analyzing their performance on an annotated Urdu legal dataset. mBERT demonstrated superior accuracy (0.999), and its F1 score (0.975) outperforms XLMRoBERTa and DistilBERT, highlighting its robustness in recognizing entities within low-resource languages. To ensure the privacy of the personal identifiers, all documents are anonymized. The dataset for this study is hosted on Hugging Face and will be made public after the publication."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zafar-etal-2025-courtroom">
<titleInfo>
<title>From Courtroom to Corpora: Building a Name Entity Corpus for Urdu Legal Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Adeel</namePart>
<namePart type="family">Zafar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sohail</namePart>
<namePart type="family">Ashraf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slawomir</namePart>
<namePart type="family">Nowaczyk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era</title>
</titleInfo>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Kunilovskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marie</namePart>
<namePart type="family">Escribe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd., Shoumen, Bulgaria</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This study explores the effectiveness of transformer-based models for Named Entity Recognition (NER) in Urdu legal documents, a critical task in low-resource language processing. Given the legal texts’ specialized terminology and complex syntax, accurate entity recognition in Urdu remains challenging. We developed a legal Urdu dataset that contains 117,500 documents, generated synthetically from 47 different types of legal documents, and evaluated three BERT-based models (XLMRoBERTa, mBERT, and DistilBERT) by analyzing their performance on an annotated Urdu legal dataset. mBERT demonstrated superior accuracy (0.999), and its F1 score (0.975) outperforms XLMRoBERTa and DistilBERT, highlighting its robustness in recognizing entities within low-resource languages. To ensure the privacy of the personal identifiers, all documents are anonymized. The dataset for this study is hosted on Hugging Face and will be made public after the publication.</abstract>
<identifier type="citekey">zafar-etal-2025-courtroom</identifier>
<location>
<url>https://aclanthology.org/2025.ranlp-1.161/</url>
</location>
<part>
<date>2025-09</date>
<extent unit="page">
<start>1396</start>
<end>1405</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T From Courtroom to Corpora: Building a Name Entity Corpus for Urdu Legal Texts
%A Zafar, Adeel
%A Ashraf, Sohail
%A Nowaczyk, Slawomir
%Y Angelova, Galia
%Y Kunilovskaya, Maria
%Y Escribe, Marie
%Y Mitkov, Ruslan
%S Proceedings of the 15th International Conference on Recent Advances in Natural Language Processing - Natural Language Processing in the Generative AI Era
%D 2025
%8 September
%I INCOMA Ltd., Shoumen, Bulgaria
%C Varna, Bulgaria
%F zafar-etal-2025-courtroom
%X This study explores the effectiveness of transformer-based models for Named Entity Recognition (NER) in Urdu legal documents, a critical task in low-resource language processing. Given the legal texts’ specialized terminology and complex syntax, accurate entity recognition in Urdu remains challenging. We developed a legal Urdu dataset that contains 117,500 documents, generated synthetically from 47 different types of legal documents, and evaluated three BERT-based models (XLMRoBERTa, mBERT, and DistilBERT) by analyzing their performance on an annotated Urdu legal dataset. mBERT demonstrated superior accuracy (0.999), and its F1 score (0.975) outperforms XLMRoBERTa and DistilBERT, highlighting its robustness in recognizing entities within low-resource languages. To ensure the privacy of the personal identifiers, all documents are anonymized. The dataset for this study is hosted on Hugging Face and will be made public after the publication.
%U https://aclanthology.org/2025.ranlp-1.161/
%P 1396-1405
Markdown (Informal)
[From Courtroom to Corpora: Building a Name Entity Corpus for Urdu Legal Texts](https://aclanthology.org/2025.ranlp-1.161/) (Zafar et al., RANLP 2025)
ACL