% PROPOR 2026 (Vol. 2) conference paper; the url field points at its ACL Anthology page.
% The last author is written "de Paiva, Valeria" so BibTeX parses "de" as the von part
% of the surname instead of abbreviating it as a given-name initial.
% NOTE(review): address is "Salvador, Brazil" -- presumably the conference venue rather
% than the publisher's city; confirm before relying on styles that print it next to the publisher.
@inproceedings{freitas-etal-2026-towards,
  title     = {Towards a {Universal} {Dependencies} Corpus for {Portuguese} Epidemiological Reports},
  author    = {Freitas, Christian and
               Real, Livy and
               Berton, Lilian and
               de Paiva, Valeria},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 2},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.propor-2.31/},
  pages     = {228--237},
  isbn      = {979-8-89176-387-6},
  abstract  = {We present an ongoing research project focused on the construction of a Universal Dependencies (UD) corpus of Portuguese epidemiological reports derived from documents published within the Brazilian public health system. We describe findings and challenges to build such a corpus from PDF reports processed through a controlled document extraction pipeline that contrasts layout-aware extraction with raw PDF text extraction, explicitly addressing the impact of tabular content on downstream syntactic analysis. Narrative text is annotated using multiple UD parsers for Portuguese, including widely used and state-of-the-art tools, and their outputs are systematically compared using descriptive structural indicators and targeted qualitative inspection. Our analysis highlights domain-specific challenges in epidemiological texts and shows that document extraction and representation choices have a stronger effect on parsing behavior than parser selection alone. Based on these findings, we identify robust preprocessing configurations and discuss design choices for a UD-epidemiological corpus to support future research on syntactic parsing, domain adaptation, and downstream natural language processing tasks in epidemiology and public health.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="freitas-etal-2026-towards">
<titleInfo>
<title>Towards a Universal Dependencies Corpus for Portuguese Epidemiological Reports</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Livy</namePart>
<namePart type="family">Real</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lilian</namePart>
<namePart type="family">Berton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valeria</namePart>
<namePart type="family">de Paiva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>We present an ongoing research project focused on the construction of a Universal Dependencies (UD) corpus of Portuguese epidemiological reports derived from documents published within the Brazilian public health system. We describe findings and challenges to build such a corpus from PDF reports processed through a controlled document extraction pipeline that contrasts layout-aware extraction with raw PDF text extraction, explicitly addressing the impact of tabular content on downstream syntactic analysis. Narrative text is annotated using multiple UD parsers for Portuguese, including widely used and state-of-the-art tools, and their outputs are systematically compared using descriptive structural indicators and targeted qualitative inspection. Our analysis highlights domain-specific challenges in epidemiological texts and shows that document extraction and representation choices have a stronger effect on parsing behavior than parser selection alone. Based on these findings, we identify robust preprocessing configurations and discuss design choices for a UD-epidemiological corpus to support future research on syntactic parsing, domain adaptation, and downstream natural language processing tasks in epidemiology and public health.</abstract>
<identifier type="citekey">freitas-etal-2026-towards</identifier>
<location>
<url>https://aclanthology.org/2026.propor-2.31/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>228</start>
<end>237</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Towards a Universal Dependencies Corpus for Portuguese Epidemiological Reports
%A Freitas, Christian
%A Real, Livy
%A Berton, Lilian
%A de Paiva, Valeria
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 2
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F freitas-etal-2026-towards
%X We present an ongoing research project focused on the construction of a Universal Dependencies (UD) corpus of Portuguese epidemiological reports derived from documents published within the Brazilian public health system. We describe findings and challenges to build such a corpus from PDF reports processed through a controlled document extraction pipeline that contrasts layout-aware extraction with raw PDF text extraction, explicitly addressing the impact of tabular content on downstream syntactic analysis. Narrative text is annotated using multiple UD parsers for Portuguese, including widely used and state-of-the-art tools, and their outputs are systematically compared using descriptive structural indicators and targeted qualitative inspection. Our analysis highlights domain-specific challenges in epidemiological texts and shows that document extraction and representation choices have a stronger effect on parsing behavior than parser selection alone. Based on these findings, we identify robust preprocessing configurations and discuss design choices for a UD-epidemiological corpus to support future research on syntactic parsing, domain adaptation, and downstream natural language processing tasks in epidemiology and public health.
%U https://aclanthology.org/2026.propor-2.31/
%P 228-237
Markdown (Informal)
[Towards a Universal Dependencies Corpus for Portuguese Epidemiological Reports](https://aclanthology.org/2026.propor-2.31/) (Freitas et al., PROPOR 2026)
ACL