@inproceedings{chousa-hirao-2025-automatic,
title = "Automatic Evaluation of Language Generation Technology Based on Structure Alignment",
author = "Chousa, Katsuki and
Hirao, Tsutomu",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.512/",
pages = "7663--7670",
abstract = "Language generation techniques require automatic evaluation to carry out efficient and reproducible experiments. While n-gram matching is standard, it fails to capture semantic equivalence with different wording. Recent methods have addressed this issue by using contextual embeddings from pre-trained language models to compute the similarity between reference and hypothesis. However, these methods frequently disregard the syntax of sentences, despite its crucial role in determining meaning, and thus assign unjustifiably high scores. This paper proposes an automatic evaluation metric that considers both the words in sentences and their syntactic structures. We integrate syntactic information into the recent embedding-based approach. Experimental results obtained from two NLP tasks show that our method is at least comparable to standard baselines."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chousa-hirao-2025-automatic">
<titleInfo>
<title>Automatic Evaluation of Language Generation Technology Based on Structure Alignment</title>
</titleInfo>
<name type="personal">
<namePart type="given">Katsuki</namePart>
<namePart type="family">Chousa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tsutomu</namePart>
<namePart type="family">Hirao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Di Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language generation techniques require automatic evaluation to carry out efficient and reproducible experiments. While n-gram matching is standard, it fails to capture semantic equivalence with different wording. Recent methods have addressed this issue by using contextual embeddings from pre-trained language models to compute the similarity between reference and hypothesis. However, these methods frequently disregard the syntax of sentences, despite its crucial role in determining meaning, and thus assign unjustifiably high scores. This paper proposes an automatic evaluation metric that considers both the words in sentences and their syntactic structures. We integrate syntactic information into the recent embedding-based approach. Experimental results obtained from two NLP tasks show that our method is at least comparable to standard baselines.</abstract>
<identifier type="citekey">chousa-hirao-2025-automatic</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.512/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>7663</start>
<end>7670</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Evaluation of Language Generation Technology Based on Structure Alignment
%A Chousa, Katsuki
%A Hirao, Tsutomu
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F chousa-hirao-2025-automatic
%X Language generation techniques require automatic evaluation to carry out efficient and reproducible experiments. While n-gram matching is standard, it fails to capture semantic equivalence with different wording. Recent methods have addressed this issue by using contextual embeddings from pre-trained language models to compute the similarity between reference and hypothesis. However, these methods frequently disregard the syntax of sentences, despite its crucial role in determining meaning, and thus assign unjustifiably high scores. This paper proposes an automatic evaluation metric that considers both the words in sentences and their syntactic structures. We integrate syntactic information into the recent embedding-based approach. Experimental results obtained from two NLP tasks show that our method is at least comparable to standard baselines.
%U https://aclanthology.org/2025.coling-main.512/
%P 7663-7670
Markdown (Informal)
[Automatic Evaluation of Language Generation Technology Based on Structure Alignment](https://aclanthology.org/2025.coling-main.512/) (Chousa & Hirao, COLING 2025)
ACL