% Findings of ACL 2026 paper "SSR-Zero: Simple Self-Rewarding Reinforcement
% Learning for Machine Translation" (Yang et al.), ACL Anthology record.
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
% The system name in the title is brace-protected as a whole so that
% sentence-casing styles keep "SSR-Zero" intact.
@inproceedings{yang-etal-2026-ssr,
    title     = {{SSR-Zero}: Simple Self-Rewarding Reinforcement Learning for Machine Translation},
    author    = {Yang, Wenjie and
                 Zheng, Mao and
                 Song, Mingyang and
                 Li, Zheng and
                 Wang, Sitong},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.findings-acl.300/},
    pages     = {6039--6052},
    isbn      = {979-8-89176-395-1},
    abstract  = {Large language models (LLMs) have recently demonstrated remarkable capabilities in machine translation (MT). However, most advanced MT-specific LLMs rely heavily on external supervision during training, such as human-annotated reference data or trained reward models (RMs), which are expensive to obtain and difficult to scale. To address this limitation, we propose Simple Self-Rewarding (SSR), a reinforcement learning (RL) framework for MT that is reference-free and relies solely on self-judging rewards. Using only 13K monolingual examples and Qwen-2.5-7B as the backbone, SSR-Zero-7B outperforms existing MT-specific LLMs as well as larger general LLMs such as Qwen2.5-32B-Instruct on English $\leftrightarrow$ Chinese translation benchmarks including WMT23, WMT24, and FLORES200. It further demonstrates strong generalization to low-resource language pairs. In addition, when augmented with external supervision from COMET, our strongest model, SSR-X-Zero-7B, surpasses all existing open-source models under 72B parameters and performs competitively with leading closed-source systems in English $\leftrightarrow$ Chinese translation. Our analysis highlights the effectiveness and generalizability of the self-rewarding mechanism relative to external LLM-as-a-judge approaches and demonstrates its complementary benefits when combined with trained RMs. We will publicly release our code, data, and models.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yang-etal-2026-ssr">
<titleInfo>
<title>SSR-Zero: Simple Self-Rewarding Reinforcement Learning for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mao</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingyang</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zheng</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sitong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large language models (LLMs) have recently demonstrated remarkable capabilities in machine translation (MT). However, most advanced MT-specific LLMs rely heavily on external supervision during training, such as human-annotated reference data or trained reward models (RMs), which are expensive to obtain and difficult to scale. To address this limitation, we propose Simple Self-Rewarding (SSR), a reinforcement learning (RL) framework for MT that is reference-free and relies solely on self-judging rewards. Using only 13K monolingual examples and Qwen-2.5-7B as the backbone, SSR-Zero-7B outperforms existing MT-specific LLMs as well as larger general LLMs such as Qwen2.5-32B-Instruct on English ↔ Chinese translation benchmarks including WMT23, WMT24, and FLORES200. It further demonstrates strong generalization to low-resource language pairs. In addition, when augmented with external supervision from COMET, our strongest model, SSR-X-Zero-7B, surpasses all existing open-source models under 72B parameters and performs competitively with leading closed-source systems in English ↔ Chinese translation. Our analysis highlights the effectiveness and generalizability of the self-rewarding mechanism relative to external LLM-as-a-judge approaches and demonstrates its complementary benefits when combined with trained RMs. We will publicly release our code, data, and models.</abstract>
<identifier type="citekey">yang-etal-2026-ssr</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.300/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>6039</start>
<end>6052</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SSR-Zero: Simple Self-Rewarding Reinforcement Learning for Machine Translation
%A Yang, Wenjie
%A Zheng, Mao
%A Song, Mingyang
%A Li, Zheng
%A Wang, Sitong
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F yang-etal-2026-ssr
%X Large language models (LLMs) have recently demonstrated remarkable capabilities in machine translation (MT). However, most advanced MT-specific LLMs rely heavily on external supervision during training, such as human-annotated reference data or trained reward models (RMs), which are expensive to obtain and difficult to scale. To address this limitation, we propose Simple Self-Rewarding (SSR), a reinforcement learning (RL) framework for MT that is reference-free and relies solely on self-judging rewards. Using only 13K monolingual examples and Qwen-2.5-7B as the backbone, SSR-Zero-7B outperforms existing MT-specific LLMs as well as larger general LLMs such as Qwen2.5-32B-Instruct on English ↔ Chinese translation benchmarks including WMT23, WMT24, and FLORES200. It further demonstrates strong generalization to low-resource language pairs. In addition, when augmented with external supervision from COMET, our strongest model, SSR-X-Zero-7B, surpasses all existing open-source models under 72B parameters and performs competitively with leading closed-source systems in English ↔ Chinese translation. Our analysis highlights the effectiveness and generalizability of the self-rewarding mechanism relative to external LLM-as-a-judge approaches and demonstrates its complementary benefits when combined with trained RMs. We will publicly release our code, data, and models.
%U https://aclanthology.org/2026.findings-acl.300/
%P 6039-6052
Markdown (Informal)
[SSR-Zero: Simple Self-Rewarding Reinforcement Learning for Machine Translation](https://aclanthology.org/2026.findings-acl.300/) (Yang et al., Findings 2026)
ACL