@inproceedings{samo-etal-2023-blm,
title = "{BLM}-s/l{E}: A structured dataset of {E}nglish spray-load verb alternations for testing generalization in {LLM}s",
author = "Samo, Giuseppe and
Nastase, Vivi and
Jiang, Chunyang and
Merlo, Paola",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.821",
doi = "10.18653/v1/2023.findings-emnlp.821",
pages = "12276--12287",
abstract = "Current NLP models appear to be achieving performance comparable to human capabilities on well-established benchmarks. New benchmarks are now necessary to test deeper layers of understanding of natural languages by these models. Blackbird{'}s Language Matrices are a recently developed framework that draws inspiration from tests of human analytic intelligence. The BLM task has revealed that successful performances in previously studied linguistic problems do not yet stem from a deep understanding of the generative factors that define these problems. In this study, we define a new BLM task for predicate-argument structure, and develop a structured dataset for its investigation, concentrating on the spray-load verb alternations in English, as a case study. The context sentences include one alternant from the spray-load alternation and the target sentence is the other alternant, to be chosen among a minimally contrastive and adversarial set of answers. We describe the generation process of the dataset and the reasoning behind the generating rules. The dataset aims to facilitate investigations into how verb information is encoded in sentence embeddings and how models generalize to the complex properties of argument structures. Benchmarking experiments conducted on the dataset and qualitative error analysis on the answer set reveal the inherent challenges associated with the problem even for current high-performing representations.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="samo-etal-2023-blm">
<titleInfo>
<title>BLM-s/lE: A structured dataset of English spray-load verb alternations for testing generalization in LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Giuseppe</namePart>
<namePart type="family">Samo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivi</namePart>
<namePart type="family">Nastase</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chunyang</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paola</namePart>
<namePart type="family">Merlo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Current NLP models appear to be achieving performance comparable to human capabilities on well-established benchmarks. New benchmarks are now necessary to test deeper layers of understanding of natural languages by these models. Blackbird’s Language Matrices are a recently developed framework that draws inspiration from tests of human analytic intelligence. The BLM task has revealed that successful performances in previously studied linguistic problems do not yet stem from a deep understanding of the generative factors that define these problems. In this study, we define a new BLM task for predicate-argument structure, and develop a structured dataset for its investigation, concentrating on the spray-load verb alternations in English, as a case study. The context sentences include one alternant from the spray-load alternation and the target sentence is the other alternant, to be chosen among a minimally contrastive and adversarial set of answers. We describe the generation process of the dataset and the reasoning behind the generating rules. The dataset aims to facilitate investigations into how verb information is encoded in sentence embeddings and how models generalize to the complex properties of argument structures. Benchmarking experiments conducted on the dataset and qualitative error analysis on the answer set reveal the inherent challenges associated with the problem even for current high-performing representations.</abstract>
<identifier type="citekey">samo-etal-2023-blm</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.821</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.821</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>12276</start>
<end>12287</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BLM-s/lE: A structured dataset of English spray-load verb alternations for testing generalization in LLMs
%A Samo, Giuseppe
%A Nastase, Vivi
%A Jiang, Chunyang
%A Merlo, Paola
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F samo-etal-2023-blm
%X Current NLP models appear to be achieving performance comparable to human capabilities on well-established benchmarks. New benchmarks are now necessary to test deeper layers of understanding of natural languages by these models. Blackbird’s Language Matrices are a recently developed framework that draws inspiration from tests of human analytic intelligence. The BLM task has revealed that successful performances in previously studied linguistic problems do not yet stem from a deep understanding of the generative factors that define these problems. In this study, we define a new BLM task for predicate-argument structure, and develop a structured dataset for its investigation, concentrating on the spray-load verb alternations in English, as a case study. The context sentences include one alternant from the spray-load alternation and the target sentence is the other alternant, to be chosen among a minimally contrastive and adversarial set of answers. We describe the generation process of the dataset and the reasoning behind the generating rules. The dataset aims to facilitate investigations into how verb information is encoded in sentence embeddings and how models generalize to the complex properties of argument structures. Benchmarking experiments conducted on the dataset and qualitative error analysis on the answer set reveal the inherent challenges associated with the problem even for current high-performing representations.
%R 10.18653/v1/2023.findings-emnlp.821
%U https://aclanthology.org/2023.findings-emnlp.821
%U https://doi.org/10.18653/v1/2023.findings-emnlp.821
%P 12276-12287
Markdown (Informal)
[BLM-s/lE: A structured dataset of English spray-load verb alternations for testing generalization in LLMs](https://aclanthology.org/2023.findings-emnlp.821) (Samo et al., Findings 2023)
ACL