% Conference paper entry (PROPOR 2026, Vol. 2); canonical record URL is in the url field.
% Accented letters use BibTeX special-character form, e.g. {\'e} and {\`a}.
@inproceedings{zilio-finatto-2026-exploring,
    title = "Exploring automatic terminology extraction from historical medical data",
    author = "Zilio, Leonardo and
      Finatto, Maria Jos{\'e} Bocorny",
    editor = "Souza, Marlo and
      de-Dios-Flores, Iria and
      Santos, Diana and
      Freitas, Larissa and
      Souza, Jackson Wilke da Cruz and
      Ribeiro, Eug{\'e}nio",
    booktitle = "Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 2",
    month = apr,
    year = "2026",
    address = "Salvador, Brazil",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.propor-2.37/",
    pages = "282--292",
    ISBN = "979-8-89176-387-6",
    abstract = "This paper analyzes the performance of several terminology extraction methods when confronted with historical specialized texts that do not conform with modern orthographical norms. We tested two extraction methods based on linguistic patterns, four prompt-based generative artificial intelligence (GenAI) models, and one BERT-like model. Some of these models went through fine-tuning for terminology extraction, and one of these is specialized in the extraction of medical terms from documents written in Portuguese. For the GenAI models, we tested four different prompting strategies. As test set, we used chapter fifteen of the second part of the book Aviso {\`a} Gente do Mar sobre a sua Saude [Advice to Sea People about their Health], originally written in French by G. Mauran at the end of the 18th century, and translated and adapted to Portuguese in 1794. The chapter was annotated with terminology, and the evaluation was conducted independently both in terms of f-measure, as well as in terms of pure precision, to observe if the automatic extraction methods could complement the manual token-based annotation. Results show that using automatic extraction methods to complement the manual annotation can improve coverage, and that individual models do not achieve high extraction quality, but, by combining two or more models, a recall of more than 90{\%} could be achieved in the test data."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zilio-finatto-2026-exploring">
<titleInfo>
<title>Exploring automatic terminology extraction from historical medical data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Leonardo</namePart>
<namePart type="family">Zilio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="given">José</namePart>
<namePart type="given">Bocorny</namePart>
<namePart type="family">Finatto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>This paper analyzes the performance of several terminology extraction methods when confronted with historical specialized texts that do not conform with modern orthographical norms. We tested two extraction methods based on linguistic patterns, four prompt-based generative artificial intelligence (GenAI) models, and one BERT-like model. Some of these models went through fine-tuning for terminology extraction, and one of these is specialized in the extraction of medical terms from documents written in Portuguese. For the GenAI models, we tested four different prompting strategies. As test set, we used chapter fifteen of the second part of the book Aviso à Gente do Mar sobre a sua Saude [Advice to Sea People about their Health], originally written in French by G. Mauran at the end of the 18th century, and translated and adapted to Portuguese in 1794. The chapter was annotated with terminology, and the evaluation was conducted independently both in terms of f-measure, as well as in terms of pure precision, to observe if the automatic extraction methods could complement the manual token-based annotation. Results show that using automatic extraction methods to complement the manual annotation can improve coverage, and that individual models do not achieve high extraction quality, but, by combining two or more models, a recall of more than 90% could be achieved in the test data.</abstract>
<identifier type="citekey">zilio-finatto-2026-exploring</identifier>
<location>
<url>https://aclanthology.org/2026.propor-2.37/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>282</start>
<end>292</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring automatic terminology extraction from historical medical data
%A Zilio, Leonardo
%A Finatto, Maria José Bocorny
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F zilio-finatto-2026-exploring
%X This paper analyzes the performance of several terminology extraction methods when confronted with historical specialized texts that do not conform with modern orthographical norms. We tested two extraction methods based on linguistic patterns, four prompt-based generative artificial intelligence (GenAI) models, and one BERT-like model. Some of these models went through fine-tuning for terminology extraction, and one of these is specialized in the extraction of medical terms from documents written in Portuguese. For the GenAI models, we tested four different prompting strategies. As test set, we used chapter fifteen of the second part of the book Aviso à Gente do Mar sobre a sua Saude [Advice to Sea People about their Health], originally written in French by G. Mauran at the end of the 18th century, and translated and adapted to Portuguese in 1794. The chapter was annotated with terminology, and the evaluation was conducted independently both in terms of f-measure, as well as in terms of pure precision, to observe if the automatic extraction methods could complement the manual token-based annotation. Results show that using automatic extraction methods to complement the manual annotation can improve coverage, and that individual models do not achieve high extraction quality, but, by combining two or more models, a recall of more than 90% could be achieved in the test data.
%U https://aclanthology.org/2026.propor-2.37/
%P 282-292
Markdown (Informal)
[Exploring automatic terminology extraction from historical medical data](https://aclanthology.org/2026.propor-2.37/) (Zilio & Finatto, PROPOR 2026)
ACL