@comment{BibTeX record for the BIPA paper (PROPOR 2026). MODS XML and EndNote exports of the same paper follow this entry.}
@inproceedings{sousa-etal-2026-bipa,
  title     = {{BIPA}: {Brazilian} {Portuguese} Phonetic Dataset with Dialectal Variations in {IPA} Standard},
  author    = {Sousa, Thiago Monteles de and
               Gris, Lucas Rafael and
               Silva, N{\'a}dia F{\'e}lix Felipe da},
  editor    = {Souza, Marlo and
               de-Dios-Flores, Iria and
               Santos, Diana and
               Freitas, Larissa and
               Souza, Jackson Wilke da Cruz and
               Ribeiro, Eug{\'e}nio},
  booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
  month     = apr,
  year      = {2026},
  address   = {Salvador, Brazil},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.propor-1.47/},
  pages     = {478--487},
  isbn      = {979-8-89176-387-6},
  abstract  = {This work presents BIPA, a phonetic transcription corpus for Brazilian Portuguese that covers regional dialectal variations. The corpus was constructed through automated extraction from Wiktionary, resulting in 53,353 unique words and 350,021 transcriptions in IPA format, distributed across six dialects: general Brazilian, Rio de Janeiro, S{\~a}o Paulo, South Region, Northeast Region, and Center-West Region. The average density of 6.56 transcriptions per word reflects multiple regionally conditioned phonetic variations. To validate the utility of the corpus, the ByT5-small model was fine-tuned for grapheme-to-phoneme conversion, achieving a Minimum Phoneme Error Rate of 2.66{\%} on the validation set. BIPA addresses the scarcity of computational linguistic resources for Brazilian Portuguese, enabling applications in regional speech synthesis, automatic accent recognition, and computational sociolinguistic analysis.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey sousa-etal-2026-bipa: the paper's own metadata,
     with the proceedings volume described as its host item. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sousa-etal-2026-bipa">
    <titleInfo>
      <title>BIPA: Brazilian Portuguese Phonetic Dataset with Dialectal Variations in IPA Standard</title>
    </titleInfo>
    <!-- Paper authors, in byline order -->
    <name type="personal">
      <namePart type="given">Thiago</namePart>
      <namePart type="given">Monteles</namePart>
      <namePart type="given">de</namePart>
      <namePart type="family">Sousa</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lucas</namePart>
      <namePart type="given">Rafael</namePart>
      <namePart type="family">Gris</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nádia</namePart>
      <namePart type="given">Félix</namePart>
      <namePart type="given">Felipe</namePart>
      <namePart type="given">da</namePart>
      <namePart type="family">Silva</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-04</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marlo</namePart>
        <namePart type="family">Souza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Iria</namePart>
        <namePart type="family">de-Dios-Flores</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Diana</namePart>
        <namePart type="family">Santos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Larissa</namePart>
        <namePart type="family">Freitas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jackson</namePart>
        <namePart type="given">Wilke</namePart>
        <namePart type="given">da</namePart>
        <namePart type="given">Cruz</namePart>
        <namePart type="family">Souza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Eugénio</namePart>
        <namePart type="family">Ribeiro</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Salvador, Brazil</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-387-6</identifier>
    </relatedItem>
    <abstract>This work presents BIPA, a phonetic transcription corpus for Brazilian Portuguese that covers regional dialectal variations. The corpus was constructed through automated extraction from Wiktionary, resulting in 53,353 unique words and 350,021 transcriptions in IPA format, distributed across six dialects: general Brazilian, Rio de Janeiro, São Paulo, South Region, Northeast Region, and Center-West Region. The average density of 6.56 transcriptions per word reflects multiple regionally conditioned phonetic variations. To validate the utility of the corpus, the ByT5-small model was fine-tuned for grapheme-to-phoneme conversion, achieving a Minimum Phoneme Error Rate of 2.66% on the validation set. BIPA addresses the scarcity of computational linguistic resources for Brazilian Portuguese, enabling applications in regional speech synthesis, automatic accent recognition, and computational sociolinguistic analysis.</abstract>
    <identifier type="citekey">sousa-etal-2026-bipa</identifier>
    <location>
      <url>https://aclanthology.org/2026.propor-1.47/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-04</date>
      <extent unit="page">
        <start>478</start>
        <end>487</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T BIPA: Brazilian Portuguese Phonetic Dataset with Dialectal Variations in IPA Standard
%A Sousa, Thiago Monteles de
%A Gris, Lucas Rafael
%A Silva, Nádia Félix Felipe da
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F sousa-etal-2026-bipa
%X This work presents BIPA, a phonetic transcription corpus for Brazilian Portuguese that covers regional dialectal variations. The corpus was constructed through automated extraction from Wiktionary, resulting in 53,353 unique words and 350,021 transcriptions in IPA format, distributed across six dialects: general Brazilian, Rio de Janeiro, São Paulo, South Region, Northeast Region, and Center-West Region. The average density of 6.56 transcriptions per word reflects multiple regionally conditioned phonetic variations. To validate the utility of the corpus, the ByT5-small model was fine-tuned for grapheme-to-phoneme conversion, achieving a Minimum Phoneme Error Rate of 2.66% on the validation set. BIPA addresses the scarcity of computational linguistic resources for Brazilian Portuguese, enabling applications in regional speech synthesis, automatic accent recognition, and computational sociolinguistic analysis.
%U https://aclanthology.org/2026.propor-1.47/
%P 478-487
Markdown (Informal)
[BIPA: Brazilian Portuguese Phonetic Dataset with Dialectal Variations in IPA Standard](https://aclanthology.org/2026.propor-1.47/) (Sousa et al., PROPOR 2026)
ACL