@inproceedings{harrison-walker-2026-cross,
title = "Cross-Domain Semantic Fidelity Evaluation for Meaning-to-Text Generation",
author = "Harrison, Davan and
Walker, Marilyn",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.41/",
pages = "443--455",
ISBN = "979-8-89176-423-1",
abstract = "Slot Error Rate (SER) is the standard metric for evaluating semantic accuracy in meaning-to-text generation, but computing it has historically required domain-specific scripts that do not generalize across datasets. We present a cross-domain SER evaluation framework that replaces hand-crafted rules with a learned slot extraction model. We adapt Llama-3.2-3B-Instruct with LoRA, updating only 0.34{\%} of its parameters, and show that this small adapted model outperforms prompted frontier LLMs by a wide margin on structured extraction across 23 dialogue domains. We further apply overgenerate-and-rank to the extraction task itself, generating multiple candidate meaning representations and selecting the best one with a trained ranker, which improves SER-Accuracy from 75{\%} to 88{\%}. We combine the extraction model with a Natural Language Inference (NLI) verification baseline through learned per-example routing, achieving 90.0{\%} accuracy on held-out evaluation pairs without any domain-specific rule engineering. We compare our framework against published rule-based SER tools and show that our learned approach matches or outperforms hand-crafted scripts on all six comparable domains."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="harrison-walker-2026-cross">
<titleInfo>
<title>Cross-Domain Semantic Fidelity Evaluation for Meaning-to-Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Davan</namePart>
<namePart type="family">Harrison</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marilyn</namePart>
<namePart type="family">Walker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>Slot Error Rate (SER) is the standard metric for evaluating semantic accuracy in meaning-to-text generation, but computing it has historically required domain-specific scripts that do not generalize across datasets. We present a cross-domain SER evaluation framework that replaces hand-crafted rules with a learned slot extraction model. We adapt Llama-3.2-3B-Instruct with LoRA, updating only 0.34% of its parameters, and show that this small adapted model outperforms prompted frontier LLMs by a wide margin on structured extraction across 23 dialogue domains. We further apply overgenerate-and-rank to the extraction task itself, generating multiple candidate meaning representations and selecting the best one with a trained ranker, which improves SER-Accuracy from 75% to 88%. We combine the extraction model with a Natural Language Inference (NLI) verification baseline through learned per-example routing, achieving 90.0% accuracy on held-out evaluation pairs without any domain-specific rule engineering. We compare our framework against published rule-based SER tools and show that our learned approach matches or outperforms hand-crafted scripts on all six comparable domains.</abstract>
<identifier type="citekey">harrison-walker-2026-cross</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.41/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>443</start>
<end>455</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Cross-Domain Semantic Fidelity Evaluation for Meaning-to-Text Generation
%A Harrison, Davan
%A Walker, Marilyn
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F harrison-walker-2026-cross
%X Slot Error Rate (SER) is the standard metric for evaluating semantic accuracy in meaning-to-text generation, but computing it has historically required domain-specific scripts that do not generalize across datasets. We present a cross-domain SER evaluation framework that replaces hand-crafted rules with a learned slot extraction model. We adapt Llama-3.2-3B-Instruct with LoRA, updating only 0.34% of its parameters, and show that this small adapted model outperforms prompted frontier LLMs by a wide margin on structured extraction across 23 dialogue domains. We further apply overgenerate-and-rank to the extraction task itself, generating multiple candidate meaning representations and selecting the best one with a trained ranker, which improves SER-Accuracy from 75% to 88%. We combine the extraction model with a Natural Language Inference (NLI) verification baseline through learned per-example routing, achieving 90.0% accuracy on held-out evaluation pairs without any domain-specific rule engineering. We compare our framework against published rule-based SER tools and show that our learned approach matches or outperforms hand-crafted scripts on all six comparable domains.
%U https://aclanthology.org/2026.gem-main.41/
%P 443-455
Markdown (Informal)
[Cross-Domain Semantic Fidelity Evaluation for Meaning-to-Text Generation](https://aclanthology.org/2026.gem-main.41/) (Harrison & Walker, GEM 2026)
ACL