% Conference paper, pages 50--57 of the NLP4DH 2026 proceedings.
% The abstract is stored as plain text without presentation markup,
% so that any formatting is left to the bibliography style.
@inproceedings{toutou-etal-2026-data,
  title     = {Data Contamination in Neural Hieroglyphic Translation: A Reproducibility Study},
  author    = {Toutou, Ammar and
               Harb, Abdelrahman and
               Basta, Christine},
  editor    = {Hamilton, Sil and
               {\"O}hman, Emily and
               Hicke, Rebecca M. M. and
               Bizzoni, Yuri and
               Bax, Axel and
               Matthews, Jacob A. and
               H{\"a}m{\"a}l{\"a}inen, Mika},
  booktitle = {Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.nlp4dh-1.6/},
  pages     = {50--57},
  isbn      = {979-8-89176-427-9},
  abstract  = {Ancient and endangered languages pose a unique challenge for NLP: their datasets are inherently scarce, difficult to expand, and built from formulaic corpora{---}making data-quality issues especially consequential yet rarely audited. Motivated by the need to understand what current NMT can realistically achieve for such languages, we investigate hieroglyphic-to-German translation, where a recent study reported 61.5 BLEU using fine-tuned M2M-100. Our reproduction yields only 37.0 BLEU with the released model. Investigating this gap, we find 32{\%} of test targets appear identically in training (16/50; 50{\%} under 8-gram overlap at 70{\%} threshold). This contamination inflates scores dramatically: contaminated samples achieve up to 83.8 BLEU / 0.924 COMET-22 versus 30.9{--}39.2 BLEU / 0.622{--}0.676 COMET-22 on clean samples across five model configurations spanning two architectures. Document-level decontamination reduces contaminated BLEU by only 4.6 points because 8/16 targets persist via other source documents{---}target-level deduplication is required. We release a decontaminated 34-sample test set and establish corrected baselines (30.9{--}39.2 BLEU), providing a realistic assessment of NMT capability for this endangered writing system.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="toutou-etal-2026-data">
    <titleInfo>
      <title>Data Contamination in Neural Hieroglyphic Translation: A Reproducibility Study</title>
    </titleInfo>

    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Ammar</namePart>
      <namePart type="family">Toutou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Abdelrahman</namePart>
      <namePart type="family">Harb</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Christine</namePart>
      <namePart type="family">Basta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sil</namePart>
        <namePart type="family">Hamilton</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Emily</namePart>
        <namePart type="family">Öhman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rebecca</namePart>
        <namePart type="given">M</namePart>
        <namePart type="given">M</namePart>
        <namePart type="family">Hicke</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yuri</namePart>
        <namePart type="family">Bizzoni</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Axel</namePart>
        <namePart type="family">Bax</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jacob</namePart>
        <namePart type="given">A</namePart>
        <namePart type="family">Matthews</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mika</namePart>
        <namePart type="family">Hämäläinen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-427-9</identifier>
    </relatedItem>

    <abstract>Ancient and endangered languages pose a unique challenge for NLP: their datasets are inherently scarce, difficult to expand, and built from formulaic corpora—making data-quality issues especially consequential yet rarely audited. Motivated by the need to understand what current NMT can realistically achieve for such languages, we investigate hieroglyphic-to-German translation, where a recent study reported 61.5 BLEU using fine-tuned M2M-100. Our reproduction yields only 37.0 BLEU with the released model. Investigating this gap, we find 32% of test targets appear identically in training (16/50; 50% under 8-gram overlap at 70% threshold). This contamination inflates scores dramatically: contaminated samples achieve up to 83.8 BLEU / 0.924 COMET-22 versus 30.9–39.2 BLEU / 0.622–0.676 COMET-22 on clean samples across five model configurations spanning two architectures. Document-level decontamination reduces contaminated BLEU by only 4.6 points because 8/16 targets persist via other source documents—target-level deduplication is required. We release a decontaminated 34-sample test set and establish corrected baselines (30.9–39.2 BLEU), providing a realistic assessment of NMT capability for this endangered writing system.</abstract>
    <identifier type="citekey">toutou-etal-2026-data</identifier>
    <location>
      <url>https://aclanthology.org/2026.nlp4dh-1.6/</url>
    </location>

    <!-- Position of the paper within the host volume: issue date and page range. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>50</start>
        <end>57</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Data Contamination in Neural Hieroglyphic Translation: A Reproducibility Study
%A Toutou, Ammar
%A Harb, Abdelrahman
%A Basta, Christine
%Y Hamilton, Sil
%Y Öhman, Emily
%Y Hicke, Rebecca M. M.
%Y Bizzoni, Yuri
%Y Bax, Axel
%Y Matthews, Jacob A.
%Y Hämäläinen, Mika
%S Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, USA
%@ 979-8-89176-427-9
%F toutou-etal-2026-data
%X Ancient and endangered languages pose a unique challenge for NLP: their datasets are inherently scarce, difficult to expand, and built from formulaic corpora—making data-quality issues especially consequential yet rarely audited. Motivated by the need to understand what current NMT can realistically achieve for such languages, we investigate hieroglyphic-to-German translation, where a recent study reported 61.5 BLEU using fine-tuned M2M-100. Our reproduction yields only 37.0 BLEU with the released model. Investigating this gap, we find 32% of test targets appear identically in training (16/50; 50% under 8-gram overlap at 70% threshold). This contamination inflates scores dramatically: contaminated samples achieve up to 83.8 BLEU / 0.924 COMET-22 versus 30.9–39.2 BLEU / 0.622–0.676 COMET-22 on clean samples across five model configurations spanning two architectures. Document-level decontamination reduces contaminated BLEU by only 4.6 points because 8/16 targets persist via other source documents—target-level deduplication is required. We release a decontaminated 34-sample test set and establish corrected baselines (30.9–39.2 BLEU), providing a realistic assessment of NMT capability for this endangered writing system.
%U https://aclanthology.org/2026.nlp4dh-1.6/
%P 50-57
Markdown (Informal)
[Data Contamination in Neural Hieroglyphic Translation: A Reproducibility Study](https://aclanthology.org/2026.nlp4dh-1.6/) (Toutou et al., NLP4DH 2026)
ACL