@inproceedings{simoes-etal-2026-field,
title = "Field of Science and Technology Classification of Academic Documents in {P}ortuguese",
author = "Sim{\~o}es, Ivo and
Oliveira, Hugo Gon{\c{c}}alo and
Correia, Jo{\~a}o",
editor = "Souza, Marlo and
de-Dios-Flores, Iria and
Santos, Diana and
Freitas, Larissa and
Souza, Jackson Wilke da Cruz and
Ribeiro, Eug{\'e}nio",
booktitle = "Proceedings of the 17th International Conference on Computational Processing of {P}ortuguese ({PROPOR} 2026) - Vol. 1",
month = apr,
year = "2026",
address = "Salvador, Brazil",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.propor-1.104/",
pages = "1021--1026",
ISBN = "979-8-89176-387-6",
abstract = "Towards improving metadata in academic repositories, this study evaluates the efficacy of different transformer-based models in the automatic classification of the Field of Science and Technology (FOS) of academic theses written in Portuguese. We compare the performance of four different encoder models, two multilingual and two Portuguese-specific, against five larger decoder-based LLMs, on a dataset of 9,696 theses characterized by their title, keywords, and abstract. Fine-tuned encoder-based models achieved the best scores (F1 = 88{\%}), outperforming general-purpose decoder models prompted for the task. These results suggest that, for localized academic domains, task-specific fine-tuning remains more effective than general-purpose LLM prompting."
}