% PROPOR 2026 (Vol. 1) conference paper, pages 628--637.
% Author names with particles use the "von Last, First" form so that BibTeX
% assigns the particle to the surname instead of the given names.
% NOTE(review): "Di Felippo" is assumed to be the full family name of the first
% author; confirm against the published paper.
@inproceedings{felippo-etal-2026-lexical,
    title     = {Lexical and Orthographic Variation in {Portuguese} Financial Tweets: Annotation, Analysis, and Implications for Embedding Models},
    author    = {Di Felippo, Ariani and
                 Roman, Norton Trevisan and
                 Barbosa, Bryan K. S. and
                 de Oliveira, Gabriela Pinheiro and
                 Scandarolli, Clarissa Lenina},
    editor    = {Souza, Marlo and
                 de-Dios-Flores, Iria and
                 Santos, Diana and
                 Freitas, Larissa and
                 Souza, Jackson Wilke da Cruz and
                 Ribeiro, Eug{\'e}nio},
    booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
    month     = apr,
    year      = {2026},
    address   = {Salvador, Brazil},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.propor-1.62/},
    pages     = {628--637},
    isbn      = {979-8-89176-387-6},
    abstract  = {Twitter/X remains a key source of user-generated content, requiring Natural Language Processing tools capable of handling non-canonical language. This study presents a manual annotation of lexical and orthographic phenomena in DANTEStocks, a corpus of financial tweets in Brazilian Portuguese, using a hierarchical typology to capture both creative uses and deviations from the standard norm. Results show that orthographic variation is strongly influenced by creative forms, mainly driven by platform- and domain-specific innovations. Standard norm variation is systematic, mostly involving predictable omissions of diacritics and the cedilla, and most tokens exhibit only one phenomenon, reflecting stable and largely independent patterns of variation in this Twitter subgenre. The identified variant forms enabled the construction of a lexicon for evaluating embedding models. We assessed how BERTimbau, Word2Vec, and FastText handle lexical variation in raw, unnormalized data, showing that the lexicon reduces out-of-vocabulary rates and improves coverage. These results highlight model robustness and the value of curated lexical resources in complementing both fixed and data-driven vocabularies.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="felippo-etal-2026-lexical">
<titleInfo>
<title>Lexical and Orthographic Variation in Portuguese Financial Tweets: Annotation, Analysis, and Implications for Embedding Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ariani</namePart>
<namePart type="family">Di Felippo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Norton</namePart>
<namePart type="given">Trevisan</namePart>
<namePart type="family">Roman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bryan</namePart>
<namePart type="given">K</namePart>
<namePart type="given">S</namePart>
<namePart type="family">Barbosa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriela</namePart>
<namePart type="given">Pinheiro</namePart>
<namePart type="given">de</namePart>
<namePart type="family">Oliveira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Clarissa</namePart>
<namePart type="given">Lenina</namePart>
<namePart type="family">Scandarolli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Twitter/X remains a key source of user-generated content, requiring Natural Language Processing tools capable of handling non-canonical language. This study presents a manual annotation of lexical and orthographic phenomena in DANTEStocks, a corpus of financial tweets in Brazilian Portuguese, using a hierarchical typology to capture both creative uses and deviations from the standard norm. Results show that orthographic variation is strongly influenced by creative forms, mainly driven by platform- and domain-specific innovations. Standard norm variation is systematic, mostly involving predictable omissions of diacritics and the cedilla, and most tokens exhibit only one phenomenon, reflecting stable and largely independent patterns of variation in this Twitter subgenre. The identified variant forms enabled the construction of a lexicon for evaluating embedding models. We assessed how BERTimbau, Word2Vec, and FastText handle lexical variation in raw, unnormalized data, showing that the lexicon reduces out-of-vocabulary rates and improves coverage. These results highlight model robustness and the value of curated lexical resources in complementing both fixed and data-driven vocabularies.</abstract>
<identifier type="citekey">felippo-etal-2026-lexical</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.62/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>628</start>
<end>637</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lexical and Orthographic Variation in Portuguese Financial Tweets: Annotation, Analysis, and Implications for Embedding Models
%A Di Felippo, Ariani
%A Roman, Norton Trevisan
%A Barbosa, Bryan K. S.
%A Oliveira, Gabriela Pinheiro de
%A Scandarolli, Clarissa Lenina
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F felippo-etal-2026-lexical
%X Twitter/X remains a key source of user-generated content, requiring Natural Language Processing tools capable of handling non-canonical language. This study presents a manual annotation of lexical and orthographic phenomena in DANTEStocks, a corpus of financial tweets in Brazilian Portuguese, using a hierarchical typology to capture both creative uses and deviations from the standard norm. Results show that orthographic variation is strongly influenced by creative forms, mainly driven by platform- and domain-specific innovations. Standard norm variation is systematic, mostly involving predictable omissions of diacritics and the cedilla, and most tokens exhibit only one phenomenon, reflecting stable and largely independent patterns of variation in this Twitter subgenre. The identified variant forms enabled the construction of a lexicon for evaluating embedding models. We assessed how BERTimbau, Word2Vec, and FastText handle lexical variation in raw, unnormalized data, showing that the lexicon reduces out-of-vocabulary rates and improves coverage. These results highlight model robustness and the value of curated lexical resources in complementing both fixed and data-driven vocabularies.
%U https://aclanthology.org/2026.propor-1.62/
%P 628-637
Markdown (Informal)
[Lexical and Orthographic Variation in Portuguese Financial Tweets: Annotation, Analysis, and Implications for Embedding Models](https://aclanthology.org/2026.propor-1.62/) (Felippo et al., PROPOR 2026)
ACL