@inproceedings{antoine-etal-2024-linguistically,
title = "A linguistically-motivated evaluation methodology for unraveling model{'}s abilities in reading comprehension tasks",
author = "Antoine, Elie and
Bechet, Frederic and
Damnati, G{\'e}raldine and
Langlais, Philippe",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1021/",
doi = "10.18653/v1/2024.emnlp-main.1021",
pages = "18376--18392",
abstract = "We introduce an evaluation methodology for reading comprehension tasks based on the intuition that certain examples, by the virtue of their linguistic complexity, consistently yield lower scores regardless of model size or architecture. We capitalize on semantic frame annotation for characterizing this complexity, and study seven complexity factors that may account for model{'}s difficulty. We first deploy this methodology on a carefully annotated French reading comprehension benchmark showing that two of those complexity factors are indeed good predictors of models' failure, while others are less so. We further deploy our methodology on a well studied English benchmark by using chatGPT as a proxy for semantic annotation.Our study reveals that fine-grained linguistically-motivated automatic evaluation of a reading comprehension task is not only possible, but helps understand models' abilities to handle specific linguistic characteristics of input examples. It also shows that current state-of-the-art models fail with some for those characteristics which suggests that adequately handling them requires more than merely increasing model size."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="antoine-etal-2024-linguistically">
    <titleInfo>
      <title>A linguistically-motivated evaluation methodology for unraveling model’s abilities in reading comprehension tasks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Elie</namePart>
      <namePart type="family">Antoine</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Frederic</namePart>
      <namePart type="family">Bechet</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Géraldine</namePart>
      <namePart type="family">Damnati</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Philippe</namePart>
      <namePart type="family">Langlais</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We introduce an evaluation methodology for reading comprehension tasks based on the intuition that certain examples, by virtue of their linguistic complexity, consistently yield lower scores regardless of model size or architecture. We capitalize on semantic frame annotation to characterize this complexity, and study seven complexity factors that may account for models’ difficulty. We first deploy this methodology on a carefully annotated French reading comprehension benchmark, showing that two of those complexity factors are indeed good predictors of models’ failure, while others are less so. We further deploy our methodology on a well-studied English benchmark by using ChatGPT as a proxy for semantic annotation. Our study reveals that fine-grained, linguistically-motivated automatic evaluation of a reading comprehension task is not only possible, but also helps understand models’ abilities to handle specific linguistic characteristics of input examples. It also shows that current state-of-the-art models fail with some of those characteristics, which suggests that adequately handling them requires more than merely increasing model size.</abstract>
<identifier type="citekey">antoine-etal-2024-linguistically</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1021</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1021/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>18376</start>
<end>18392</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A linguistically-motivated evaluation methodology for unraveling model’s abilities in reading comprehension tasks
%A Antoine, Elie
%A Bechet, Frederic
%A Damnati, Géraldine
%A Langlais, Philippe
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F antoine-etal-2024-linguistically
%X We introduce an evaluation methodology for reading comprehension tasks based on the intuition that certain examples, by virtue of their linguistic complexity, consistently yield lower scores regardless of model size or architecture. We capitalize on semantic frame annotation to characterize this complexity, and study seven complexity factors that may account for models’ difficulty. We first deploy this methodology on a carefully annotated French reading comprehension benchmark, showing that two of those complexity factors are indeed good predictors of models’ failure, while others are less so. We further deploy our methodology on a well-studied English benchmark by using ChatGPT as a proxy for semantic annotation. Our study reveals that fine-grained, linguistically-motivated automatic evaluation of a reading comprehension task is not only possible, but also helps understand models’ abilities to handle specific linguistic characteristics of input examples. It also shows that current state-of-the-art models fail with some of those characteristics, which suggests that adequately handling them requires more than merely increasing model size.
%R 10.18653/v1/2024.emnlp-main.1021
%U https://aclanthology.org/2024.emnlp-main.1021/
%U https://doi.org/10.18653/v1/2024.emnlp-main.1021
%P 18376-18392
Markdown (Informal)
[A linguistically-motivated evaluation methodology for unraveling model’s abilities in reading comprehension tasks](https://aclanthology.org/2024.emnlp-main.1021/) (Antoine et al., EMNLP 2024)
ACL
Elie Antoine, Frederic Bechet, Géraldine Damnati, and Philippe Langlais. 2024. A linguistically-motivated evaluation methodology for unraveling model’s abilities in reading comprehension tasks. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 18376–18392, Miami, Florida, USA. Association for Computational Linguistics.