@inproceedings{yaich-hernandez-2025-improving,
    title = "Improving Accessibility of {SCOTUS} Opinions: A Benchmark Study and a New Dataset for Generic Heading Prediction and Specific Heading Generation",
    author = "Yaich, Malek and
      Hernandez, Nicolas",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.456/",
    pages = "6827--6839",
    abstract = "The opinions of the U.S. Supreme Court (SCOTUS) are known for their extensive length, complex legal language, and lack of titled sections, which pose significant challenges for accessibility and comprehension. This paper defines the task of automatic section titling by proposing both generic and specific headings for each section. Given the scarcity of sections with headings in SCOTUS, we study the possibility of using data from lower courts for training models. A dataset of sections with generic or specific headings covering three courts (SCOTUS and two lower courts) was compiled. A supplementary SCOTUS set was manually annotated with these two types of titles. In order to establish a benchmark, we report the performance of different systems trained for each subtask: for generic heading prediction, we compare the performance of fine-tuning non-contextual, general, and domain-oriented pretrained language models. Transformer-based sequence-to-sequence models are considered for specific heading generation. Our results show that a fine-tuned LegalBERT can achieve an F1 score of about 0.90 in predicting generic headings. They also show that BART and T5 have similar performance in generating specific headings and that, although this performance is good, there is still room for improvement. In addition, we provide a human assessment to support the generation experiment and show a quasi-linear correlation between human degrees of agreement and the results of conventional measures such as ROUGE and BERTScore."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yaich-hernandez-2025-improving">
    <titleInfo>
        <title>Improving Accessibility of SCOTUS Opinions: A Benchmark Study and a New Dataset for Generic Heading Prediction and Specific Heading Generation</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Malek</namePart>
        <namePart type="family">Yaich</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <name type="personal">
        <namePart type="given">Nicolas</namePart>
        <namePart type="family">Hernandez</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Owen</namePart>
            <namePart type="family">Rambow</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Leo</namePart>
            <namePart type="family">Wanner</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Marianna</namePart>
            <namePart type="family">Apidianaki</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Hend</namePart>
            <namePart type="family">Al-Khalifa</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Barbara</namePart>
            <namePart type="family">Di Eugenio</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Steven</namePart>
            <namePart type="family">Schockaert</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The opinions of the U.S. Supreme Court (SCOTUS) are known for their extensive length, complex legal language, and lack of titled sections, which pose significant challenges for accessibility and comprehension. This paper defines the task of automatic section titling by proposing both generic and specific headings for each section. Given the scarcity of sections with headings in SCOTUS, we study the possibility of using data from lower courts for training models. A dataset of sections with generic or specific headings covering three courts (SCOTUS and two lower courts) was compiled. A supplementary SCOTUS set was manually annotated with these two types of titles. In order to establish a benchmark, we report the performance of different systems trained for each subtask: for generic heading prediction, we compare the performance of fine-tuning non-contextual, general, and domain-oriented pretrained language models. Transformer-based sequence-to-sequence models are considered for specific heading generation. Our results show that a fine-tuned LegalBERT can achieve an F1 score of about 0.90 in predicting generic headings. They also show that BART and T5 have similar performance in generating specific headings and that, although this performance is good, there is still room for improvement. In addition, we provide a human assessment to support the generation experiment and show a quasi-linear correlation between human degrees of agreement and the results of conventional measures such as ROUGE and BERTScore.</abstract>
    <identifier type="citekey">yaich-hernandez-2025-improving</identifier>
    <location>
        <url>https://aclanthology.org/2025.coling-main.456/</url>
    </location>
    <part>
        <date>2025-01</date>
        <extent unit="page">
            <start>6827</start>
            <end>6839</end>
        </extent>
    </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Accessibility of SCOTUS Opinions: A Benchmark Study and a New Dataset for Generic Heading Prediction and Specific Heading Generation
%A Yaich, Malek
%A Hernandez, Nicolas
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F yaich-hernandez-2025-improving
%X The opinions of the U.S. Supreme Court (SCOTUS) are known for their extensive length, complex legal language, and lack of titled sections, which pose significant challenges for accessibility and comprehension. This paper defines the task of automatic section titling by proposing both generic and specific headings for each section. Given the scarcity of sections with headings in SCOTUS, we study the possibility of using data from lower courts for training models. A dataset of sections with generic or specific headings covering three courts (SCOTUS and two lower courts) was compiled. A supplementary SCOTUS set was manually annotated with these two types of titles. In order to establish a benchmark, we report the performance of different systems trained for each subtask: for generic heading prediction, we compare the performance of fine-tuning non-contextual, general, and domain-oriented pretrained language models. Transformer-based sequence-to-sequence models are considered for specific heading generation. Our results show that a fine-tuned LegalBERT can achieve an F1 score of about 0.90 in predicting generic headings. They also show that BART and T5 have similar performance in generating specific headings and that, although this performance is good, there is still room for improvement. In addition, we provide a human assessment to support the generation experiment and show a quasi-linear correlation between human degrees of agreement and the results of conventional measures such as ROUGE and BERTScore.
%U https://aclanthology.org/2025.coling-main.456/
%P 6827-6839
Markdown (Informal)
[Improving Accessibility of SCOTUS Opinions: A Benchmark Study and a New Dataset for Generic Heading Prediction and Specific Heading Generation](https://aclanthology.org/2025.coling-main.456/) (Yaich & Hernandez, COLING 2025)
ACL
Malek Yaich and Nicolas Hernandez. 2025. Improving Accessibility of SCOTUS Opinions: A Benchmark Study and a New Dataset for Generic Heading Prediction and Specific Heading Generation. In Proceedings of the 31st International Conference on Computational Linguistics, pages 6827–6839, Abu Dhabi, UAE. Association for Computational Linguistics.