@inproceedings{zouhar-etal-2026-generating,
title = "Generating Difficult-to-Translate Texts",
author = "Zouhar, Vil{\'e}m and
Xu, Wenda and
Riley, Parker and
Juraska, Juraj and
Finkelstein, Mara and
Freitag, Markus and
Deutsch, Daniel",
editor = "Chen, Pinzhen and
Zouhar, Vil{\'e}m and
Hu, Hanxu and
Khanuja, Simran and
Zhu, Wenhao and
Haddow, Barry and
Birch, Alexandra and
Aji, Alham Fikri and
Sennrich, Rico and
Hooker, Sara",
booktitle = "Proceedings of the First Workshop on Multilingual Multicultural Evaluation",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.mme-main.14/",
pages = "204--219",
ISBN = "979-8-89176-368-5",
abstract = "Machine translation benchmarks sourced from the real world are quickly obsoleted, due to most examples being easy for state-of-the-art translation models. This limits the benchmark{'}s ability to distinguish which model is better or to reveal models' weaknesses. Current methods for creating difficult test cases, such as subsampling or from-scratch synthesis, either fall short of identifying difficult examples or suffer from a lack of diversity and naturalness. Inspired by the iterative process of human experts probing for model failures, we propose MT-breaker, a method where a large language model iteratively refines a source text to increase its translation difficulty. The LLM iteratively queries a target machine translation model to guide its generation of difficult examples. Our approach generates examples that are more challenging for the target MT model while preserving the diversity of natural texts. While the examples are tailored to a particular machine translation model during the generation, the difficulty also transfers to other models and languages."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zouhar-etal-2026-generating">
<titleInfo>
<title>Generating Difficult-to-Translate Texts</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vilém</namePart>
<namePart type="family">Zouhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenda</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parker</namePart>
<namePart type="family">Riley</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juraj</namePart>
<namePart type="family">Juraska</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mara</namePart>
<namePart type="family">Finkelstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Markus</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Deutsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Multilingual Multicultural Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pinzhen</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vilém</namePart>
<namePart type="family">Zouhar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanxu</namePart>
<namePart type="family">Hu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simran</namePart>
<namePart type="family">Khanuja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenhao</namePart>
<namePart type="family">Zhu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">Birch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alham</namePart>
<namePart type="given">Fikri</namePart>
<namePart type="family">Aji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rico</namePart>
<namePart type="family">Sennrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Hooker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-368-5</identifier>
</relatedItem>
<abstract>Machine translation benchmarks sourced from the real world are quickly obsoleted, due to most examples being easy for state-of-the-art translation models. This limits the benchmark’s ability to distinguish which model is better or to reveal models’ weaknesses. Current methods for creating difficult test cases, such as subsampling or from-scratch synthesis, either fall short of identifying difficult examples or suffer from a lack of diversity and naturalness. Inspired by the iterative process of human experts probing for model failures, we propose MT-breaker, a method where a large language model iteratively refines a source text to increase its translation difficulty. The LLM iteratively queries a target machine translation model to guide its generation of difficult examples. Our approach generates examples that are more challenging for the target MT model while preserving the diversity of natural texts. While the examples are tailored to a particular machine translation model during the generation, the difficulty also transfers to other models and languages.</abstract>
<identifier type="citekey">zouhar-etal-2026-generating</identifier>
<location>
<url>https://aclanthology.org/2026.mme-main.14/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>204</start>
<end>219</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Generating Difficult-to-Translate Texts
%A Zouhar, Vilém
%A Xu, Wenda
%A Riley, Parker
%A Juraska, Juraj
%A Finkelstein, Mara
%A Freitag, Markus
%A Deutsch, Daniel
%Y Chen, Pinzhen
%Y Zouhar, Vilém
%Y Hu, Hanxu
%Y Khanuja, Simran
%Y Zhu, Wenhao
%Y Haddow, Barry
%Y Birch, Alexandra
%Y Aji, Alham Fikri
%Y Sennrich, Rico
%Y Hooker, Sara
%S Proceedings of the First Workshop on Multilingual Multicultural Evaluation
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-368-5
%F zouhar-etal-2026-generating
%X Machine translation benchmarks sourced from the real world are quickly obsoleted, due to most examples being easy for state-of-the-art translation models. This limits the benchmark’s ability to distinguish which model is better or to reveal models’ weaknesses. Current methods for creating difficult test cases, such as subsampling or from-scratch synthesis, either fall short of identifying difficult examples or suffer from a lack of diversity and naturalness. Inspired by the iterative process of human experts probing for model failures, we propose MT-breaker, a method where a large language model iteratively refines a source text to increase its translation difficulty. The LLM iteratively queries a target machine translation model to guide its generation of difficult examples. Our approach generates examples that are more challenging for the target MT model while preserving the diversity of natural texts. While the examples are tailored to a particular machine translation model during the generation, the difficulty also transfers to other models and languages.
%U https://aclanthology.org/2026.mme-main.14/
%P 204-219
Markdown (Informal)
[Generating Difficult-to-Translate Texts](https://aclanthology.org/2026.mme-main.14/) (Zouhar et al., MME 2026)
ACL
- Vilém Zouhar, Wenda Xu, Parker Riley, Juraj Juraska, Mara Finkelstein, Markus Freitag, and Daniel Deutsch. 2026. Generating Difficult-to-Translate Texts. In Proceedings of the First Workshop on Multilingual Multicultural Evaluation, pages 204–219, Rabat, Morocco. Association for Computational Linguistics.