@inproceedings{tamiosso-etal-2026-evaluating,
    title     = {Evaluating Small Language Models for {English}-to-{Portuguese} Translation: Impact of Model Scale and Quantization},
    author    = {Tamiosso, Gustavo Lopes and
                 Nunes, Rafael Oleques and
                 Balreira, Dennis Giovani},
    editor    = {Souza, Marlo and
                 de-Dios-Flores, Iria and
                 Santos, Diana and
                 Freitas, Larissa and
                 Souza, Jackson Wilke da Cruz and
                 Ribeiro, Eug{\'e}nio},
    booktitle = {Proceedings of the 17th International Conference on Computational Processing of {Portuguese} ({PROPOR} 2026) - Vol. 1},
    month     = apr,
    year      = {2026},
    address   = {Salvador, Brazil},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.propor-1.94/},
    pages     = {943--952},
    isbn      = {979-8-89176-387-6},
    abstract  = {Small language models (SLMs) are increasingly adopted for machine translation due to their lower computational and deployment costs, yet a focused and systematic evaluation for English-to-Portuguese remains limited. We benchmarked dozens of SLMs (135M{--}20B parameters) across multiple architectures and quantization schemes (FP16, Q8{\_}0, Q4{\_}K{\_}M) on two datasets: FLORES-101 (Portuguese subset, 1,012 sentences) and the multidomain OPUS-100 dataset ({\textasciitilde}10k sentences). We computed lexical and semantic metrics (BLEU, chrF, and BERTScore) and assessed statistical differences using non-parametric Friedman tests over paired sentence-level scores, followed by Wilcoxon signed-rank post-hoc comparisons with Holm correction. Normality assumptions are evaluated using the Shapiro{--}Wilk test. Our results strongly suggest that 8-bit quantization (Q8{\_}0) preserves semantic quality with negligible average loss, while 4-bit quantization (Q4{\_}K{\_}M) reaches statistical significance in roughly half of model configurations, paired effect sizes (Cliff{'}s {\ensuremath{\delta}}) remain negligible to small in magnitude, with measurable degradation concentrated in lower-capacity models. Model scale exhibits only a weak correlation with translation quality: medium-sized models can match or outperform larger ones depending on model family and pretraining. These findings highlight trade-offs between efficiency and quality and inform the design of practical English{--}to-Portuguese translation pipelines based on SLMs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tamiosso-etal-2026-evaluating">
<titleInfo>
<title>Evaluating Small Language Models for English-to-Portuguese Translation: Impact of Model Scale and Quantization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Gustavo</namePart>
<namePart type="given">Lopes</namePart>
<namePart type="family">Tamiosso</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rafael</namePart>
<namePart type="given">Oleques</namePart>
<namePart type="family">Nunes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dennis</namePart>
<namePart type="given">Giovani</namePart>
<namePart type="family">Balreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-04</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marlo</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iria</namePart>
<namePart type="family">de-Dios-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Diana</namePart>
<namePart type="family">Santos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Larissa</namePart>
<namePart type="family">Freitas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jackson</namePart>
<namePart type="given">Wilke</namePart>
<namePart type="given">da</namePart>
<namePart type="given">Cruz</namePart>
<namePart type="family">Souza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eugénio</namePart>
<namePart type="family">Ribeiro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Salvador, Brazil</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-387-6</identifier>
</relatedItem>
<abstract>Small language models (SLMs) are increasingly adopted for machine translation due to their lower computational and deployment costs, yet a focused and systematic evaluation for English-to-Portuguese remains limited. We benchmarked dozens of SLMs (135M–20B parameters) across multiple architectures and quantization schemes (FP16, Q8_0, Q4_K_M) on two datasets: FLORES-101 (Portuguese subset, 1,012 sentences) and the multidomain OPUS-100 dataset (~10k sentences). We computed lexical and semantic metrics (BLEU, chrF, and BERTScore) and assessed statistical differences using non-parametric Friedman tests over paired sentence-level scores, followed by Wilcoxon signed-rank post-hoc comparisons with Holm correction. Normality assumptions are evaluated using the Shapiro–Wilk test. Our results strongly suggest that 8-bit quantization (Q8_0) preserves semantic quality with negligible average loss, while 4-bit quantization (Q4_K_M) reaches statistical significance in roughly half of model configurations, paired effect sizes (Cliff’s δ) remain negligible to small in magnitude, with measurable degradation concentrated in lower-capacity models. Model scale exhibits only a weak correlation with translation quality: medium-sized models can match or outperform larger ones depending on model family and pretraining. These findings highlight trade-offs between efficiency and quality and inform the design of practical English–to-Portuguese translation pipelines based on SLMs.</abstract>
<identifier type="citekey">tamiosso-etal-2026-evaluating</identifier>
<location>
<url>https://aclanthology.org/2026.propor-1.94/</url>
</location>
<part>
<date>2026-04</date>
<extent unit="page">
<start>943</start>
<end>952</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating Small Language Models for English-to-Portuguese Translation: Impact of Model Scale and Quantization
%A Tamiosso, Gustavo Lopes
%A Nunes, Rafael Oleques
%A Balreira, Dennis Giovani
%Y Souza, Marlo
%Y de-Dios-Flores, Iria
%Y Santos, Diana
%Y Freitas, Larissa
%Y Souza, Jackson Wilke da Cruz
%Y Ribeiro, Eugénio
%S Proceedings of the 17th International Conference on Computational Processing of Portuguese (PROPOR 2026) - Vol. 1
%D 2026
%8 April
%I Association for Computational Linguistics
%C Salvador, Brazil
%@ 979-8-89176-387-6
%F tamiosso-etal-2026-evaluating
%X Small language models (SLMs) are increasingly adopted for machine translation due to their lower computational and deployment costs, yet a focused and systematic evaluation for English-to-Portuguese remains limited. We benchmarked dozens of SLMs (135M–20B parameters) across multiple architectures and quantization schemes (FP16, Q8_0, Q4_K_M) on two datasets: FLORES-101 (Portuguese subset, 1,012 sentences) and the multidomain OPUS-100 dataset (~10k sentences). We computed lexical and semantic metrics (BLEU, chrF, and BERTScore) and assessed statistical differences using non-parametric Friedman tests over paired sentence-level scores, followed by Wilcoxon signed-rank post-hoc comparisons with Holm correction. Normality assumptions are evaluated using the Shapiro–Wilk test. Our results strongly suggest that 8-bit quantization (Q8_0) preserves semantic quality with negligible average loss, while 4-bit quantization (Q4_K_M) reaches statistical significance in roughly half of model configurations, paired effect sizes (Cliff’s δ) remain negligible to small in magnitude, with measurable degradation concentrated in lower-capacity models. Model scale exhibits only a weak correlation with translation quality: medium-sized models can match or outperform larger ones depending on model family and pretraining. These findings highlight trade-offs between efficiency and quality and inform the design of practical English–to-Portuguese translation pipelines based on SLMs.
%U https://aclanthology.org/2026.propor-1.94/
%P 943-952
Markdown (Informal)
[Evaluating Small Language Models for English-to-Portuguese Translation: Impact of Model Scale and Quantization](https://aclanthology.org/2026.propor-1.94/) (Tamiosso et al., PROPOR 2026)
ACL