\pdfoutput=1
\documentclass[12pt]{article}
\usepackage{ACL2023}
\usepackage{times}
\usepackage{latexsym}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{microtype}
\usepackage{inconsolata}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{adjustbox}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{url}
% \usepackage{biblatex} %Imports biblatex package
% \addbibresource{references.bib} %Import the bibliography file

\author{Nina Skovgaard Schneidermann$^1$, Daniel Hershcovich$^2$ \and \\ \textbf{Bolette Sandford Pedersen}$^1$
 \\
  $^{1}$Center for Language Technology, \\$^{2}$Department of Computer Science\\University of Copenhagen\\
  \texttt{ninasc@hum.ku.dk, dh@di.ku.dk, bspedersen@hum.ku.dk}}

\title{Probing for Hyperbole in Pre-Trained Language Models}

\begin{document}

\maketitle

\begin{abstract}
Hyperbole is a common figure of speech, which is under-explored in NLP research. In this study, we conduct edge and minimal description length (MDL) probing experiments for three pre-trained language models (PLMs) in an attempt to explore the extent to which hyperbolic information is encoded in these models. We use both word-in-context and sentence-level representations as model inputs as a basis for comparison. We also annotate 63 hyperbole sentences from the HYPO dataset according to an operational taxonomy to conduct an error analysis to explore the encoding of different hyperbole categories. Our results show that hyperbole is to a limited extent encoded in PLMs, and mostly in the final layers. They also indicate that hyperbolic information may be better encoded by the sentence-level representations, which, due to the pragmatic nature of hyperbole, may therefore provide a more accurate and informative representation in PLMs. Finally, the inter-annotator agreement for our annotations, a Cohen's Kappa of 0.339, suggests that the taxonomy categories may not be intuitive and need revision or simplification.
\end{abstract}

\section{Introduction}
\label{sec:introduction}

Hyperbole is a common figure of speech that involves the use of exaggerated language for emphasis or effect \cite{claridge2010hyperbole}. Humans exaggerate in a variety of registers and contexts, spanning from the colouring of informal, everyday speech to a literary trope or a rhetorical means of persuasion. Hyperboles intentionally augment or diminish a feature of some referent of discourse, presenting this feature on some more or less abstract scale of magnitude. The task of hyperbole identification poses a challenge to natural language processing in that it is highly pragmatic and utilizes context and background knowledge to distinguish between literal and exaggerated usage of a given lexical unit. As an illustration of the pragmatic nature of hyperbole, we can inspect the following two example sentences, wherein (1A) is hyperbolic and (1B) is literal:

{\small
\noindent (1A) I've seen this movie \emph{at least eighty thousand times}. \\
\noindent (1B) These products are tested \emph{at least eighty thousand times}.
}

In (1A), it is reasonable to assume that the speaker is exaggerating the number of times they have seen this particular movie to emphasize their enjoyment or familiarity with it because this would otherwise be a significant and unrealistic time investment. However, when it comes to a particular product, it has likely gone through rigorous testing and quality control measures, which means that the statement in (1B) can reasonably be interpreted literally.

Hyperbole identification has recently attracted the interest of NLP researchers who have collected datasets manually or semi-automatically and shown that computational modelling of hyperbole is indeed plausible \cite{troiano2018computational}. However, it remains an under-explored area of research in figurative language processing (FLP), primarily because its subjective and contextual nature complicates computational modelling of the phenomenon and makes it challenging to apply a standard for collecting high-quality annotated data \cite{biddle2021harnessing}.

This paper seeks to contribute to the growing research on hyperbole identification in two ways: Firstly, we perform probing tasks to investigate whether pre-trained language models (PLMs) encode hyperbolic information in its representation without fine-tuning on task-specific data.\footnote{By ``hyperbolic'', we consistently refer to the figure of speech, not the mathematical space.}
In recent years, probing tasks have emerged as a popular approach in NLP for interpreting and analyzing model representations, and it has previously been shown that PLMs do encode both simile and metaphorical knowledge \cite{chenProbingSimileKnowledge2022}. However, to our knowledge, hyperbole probing remains so far unexplored. Therefore, we replicate edge and minimal description length (MDL) probing experiments for metaphor described by \citet{aghazadehMetaphorsPreTrainedLanguage2022} on a small hyperbole dataset constructed by \citet{troiano2018computational}. We expect that encoding hyperbole may present a larger challenge to PLMs than metaphor because hyperbole knowledge is primarily pragmatic rather than semantic \cite{mccarthyThereMillionsThem2004}.

Secondly, we build an operational taxonomy based on a meta-analysis of the linguistic treatment of hyperbole, and annotate an existing dataset according to said taxonomy \cite{mccarthyThereMillionsThem2004, mora2009all,claridge2010hyperbole, burgers2016hip,troiano2018computational}. We then use these annotations to analyze errors in model predictions to further shed light on the types of hyperboles that may pose a particular challenge to PLMs, as well as when constructing training corpora for the phenomenon. Our work will hopefully provide insight into the challenges of PLMs in identifying hyperbole, as well as contribute to developing an operational annotation standard for computational modelling of hyperbole.\footnote{Our code for the probing tasks is available at \url{https://github.com/NiSc91/HyperboleProbe}}

The remainder of this paper is structured as follows: Section 2 contains an overview of related work in hyperbole research, as well as probing experiments on other figures of speech. Section 3 provides a background on the linguistic research that is the framework for our operational taxonomy and annotation. Section 4 is a short explanation of probing tasks for PLMs, which we relate to the aim of our experiments. Section 5 outlines our experimental setup and describes the modifications made to the HYPO dataset. Section 6 provides our results and preliminary error analysis, and section 7 is a discussion of said results, as well as ideas for future research. Section 8 contains a summary and conclusions.

\section{Related Work}

In this section, we outline previous research related to both hyperbole and probing experiments on other figures of speech.

\paragraph{Hyperbole in NLP.}
While tropes such as metaphor and sarcasm have received considerable attention within figurative language processing research \cite{abulaish2020survey, rai2020survey, moores2022survey}, the automatic modelling of hyperbole is still at a relatively early stage. Research within this area can be roughly split into two objectives, hyperbole identification (HI) and hyperbole generation (HG).

Within the first, and for our purposes most interesting, category, \citet{troiano2018computational} introduce the task of hyperbole detection by showing that classical machine learning pipelines can identify hyperboles with beyond-chance accuracy. For this purpose, they collect HYPO, the only manually constructed corpus of 709 English hyperboles, and include with the hyperbolic sentences two contrasting corpora: One consisting of the manually constructed literal paraphrases to each of the sentences, and another consisting of a contrastive non-hyperbolic example using the same minimal lexical unit. They then identify a set of hand-crafted features targeting qualitative and quantitative aspects of exaggeration and report the best-performing classifier to be logistic regression using the literal paraphrases as negative examples, which achieves a 76\% F1 score. In the same realm, \citet{kongIdentifyingExaggeratedLanguage2020} address hyperbole detection using deep learning techniques on a constructed Chinese corpus and find that an LSTM with hand-crafted and embedding features produced superior results (85.4\% accuracy).
\citet{biddle2021harnessing} construct a multitask learning classification architecture for hyperbole detection using a multi-task BERT-based approach, wherein the model is fine-tuned on the HYPO dataset and takes the literal paraphrases as privileged information using triplet sampling. The authors find that their model improves the logistic regression baseline described by \citet{troiano2018computational} by 10\%. The authors also devise a series of test sentences to linguistically probe their model for extreme case formulations (ECFs), quantitative, and qualitative hyperboles, as described by \citet{mora2009all}, and find that their model particularly excels at hyperboles containing ECFs, which may be due to the lexical substitution between the hyperbole and the literal paraphrase being minimal.

Recent frameworks have also leveraged pre-trained language models to generate hyperbole and expand on existing hyperbole data in a semi-supervised way. Specifically, \citet{tianHypoGenHyperboleGeneration2021} construct a sentence-level hyperbole generation model by fine-tuning it on sentences from a Reddit corpus using the syntactic pattern known as the ``so ... that'' pattern, which is said to be a productive strategy for hyperbole \cite{mccarthyThereMillionsThem2004}. The authors annotate the data with semantic relationships within the sentence and feed the annotations to COMeT models \cite{bosselut-etal-2019-comet} trained to generate commonsense and counterfactual inference. They then train a classifier to rank hyperbole candidates and use a paraphrase model to generalize to more syntactic patterns. An HG approach by \citet{zhang2021mover} involves constructing a large-scale hyperbole corpus, HypoXL, and proposes an unsupervised approach to hyperbole generation wherein a fine-tuned BART model is used to fill in masked hyperbolic spans.

While these efforts point towards the possibility of successfully training computational models for the task of identifying hyperbole, the research so far also has significant gaps:
Firstly, hyperbole in NLP lacks a unifying definition or linguistically motivated formal theory to describe the phenomenon. This is reflected in a lack of a consistent annotation scheme and procedure for hyperbole identification in the available data, which makes hyperbole studies relatively far behind investigations of metaphor, where most annotated data use either the Metaphor Identification Procedure and its extensions \cite[MIP/MIPVU;][]{groupMIPMethodIdentifying2007,steen2019mipvu}, or Conceptual Metaphor Theory \cite[CMT;][]{lakoff1980conceptual} as a procedure for annotation. This consistency of theoretical framework and annotation procedure makes it easier to perform experiments generalizing across languages and datasets.
Secondly, limited attempts have been made to probe pre-trained language models on how well they encode hyperbole without any fine-tuning. This makes it unclear whether models simply reconstruct the hyperboles found in the fine-tuning objective, and how well the model is able to learn hyperbolic information in a zero-shot or few-shot setting.

Our experiment is, to our knowledge, the first one to not utilize a fine-tuned model on hyperbolic sentences and to instead use probing methods to test for the encoding of hyperbolic information in PLMs.

\begin{figure*}[t] 
\centering 
\begin{subfigure}[b]{\textwidth}
\centering 
\includegraphics[width=\textwidth]{dimension_new.png}
\caption{Subtree and examples for the Dimension category.}
\end{subfigure}
\begin{subfigure}[b]{\textwidth}
\centering 
\includegraphics[width=\textwidth]{type_new.png}
\caption{Subtree and examples for the Type category.}
\end{subfigure}
\caption{The first two categories in the proposed taxonomy for hyperbole with examples for each.}
\label{fig:taxonomy1}
\end{figure*}
\begin{figure}[t] 
\centering 
\begin{subfigure}[b]{\columnwidth}
	\centering 
\includegraphics[width=\linewidth]{possibility_new.png} 
\caption{Subtree and examples for the Possibility category.}
\end{subfigure}
\begin{subfigure}[b]{\columnwidth}
	\centering 
\includegraphics[width=\linewidth]{conventionality_new.png}  
\caption{Subtree and examples for the Conventionality category.}
\end{subfigure}
\caption{The last two categories in the taxonomy.}
\label{fig:taxonomy2}
\end{figure}

\paragraph{Probing PLMs for Figurative Language Information.}
Probing techniques provide ways to understand and interpret the internal representations learned by deep neural networks \cite{belinkovProbingClassifiersPromises2022}. They typically involve extracting particular features or representations from a model's intermediate layers to gain insights into its structure or decision-making process.
Several recent experiments have been designed to probe PLMs for information on figurative language. Namely, \citet{chenProbingSimileKnowledge2022} tackle similarity interpretation (SI) and generation (SG) tasks by probing simile knowledge from PLMs by testing it on similarity triple completion, i.e. sentences that take the form \textit{[NP1] is as [ADJ] as [NP2]}. Their approach is to manually construct masked sentences with this syntactic pattern and predict the candidate words in the masked position. To that end, they adopt an auxiliary training process with the MLM loss to enhance the prediction diversity of candidate words. While this kind of probing works well to generate particular syntactic constructions, it would be ineffective for hyperbole due to its relatively limited dependence on syntax.

Instead, we choose to adapt several experiments conducted for metaphor probing by \citet{aghazadehMetaphorsPreTrainedLanguage2022} for hyperbole.
The authors conduct probing in two ways: First, they train a linear probing classifier on 3 different PLMs to evaluate the accuracies and extractabilities with which they encode metaphorical knowledge. Secondly, they use MDL probing to analyze the depth of the encoding of metaphorical information in multi-layer representations. The authors further extend their experiment by generalizing across four datasets and four languages. The results suggest that contextual representations in PLMs do encode metaphorical knowledge, mostly in their middle layers, and that it is possible to transfer this information across languages and datasets provided the annotation is consistent across training and testing sets.

While we can replicate the basic probing experiments, we cannot test the model's generalizability given the scarce hyperbole data. However, we do expect that it is possible via these techniques to learn something about the internal representations of hyperbole.

\section{A Taxonomy for Hyperbole}
\label{sec:taxonomy}

In simple terms, hyperbole involves exaggerating a feature's property X beyond what is justified by the literal state of affairs \cite{claridge2010hyperbole, troiano2018computational}. Stated in a more discourse-centred way, hyperbole occurs when an expression is more extreme than justified given the ontological referent, i.e. the entity in the world referenced by the text \cite{burgers2016hip}. While much of the work on hyperbole has previously been subsumed under studies of metaphor, humour, and verbal irony, recent corpus linguistic analyses have shed light on more fine-grained characteristics. Namely, the consensus in the treatment of hyperbole in literature is that the phenomenon is, among others, characterized by the presence of extreme case formulations (ECF), the ability of hyperbole to create either extreme possible worlds or downright counterfactual and absurd scenarios, and its augmentation of some property along a qualitative or quantitative scale \cite{mccarthyThereMillionsThem2004, mora2009all, claridge2010hyperbole}.

In the following, we outline some of the key characteristics and visualize them in an operational taxonomy (see Figures~\ref{fig:taxonomy1}~and~\ref{fig:taxonomy2}).

\paragraph{Dimension.}
There is widespread agreement that hyperbole occurs on a scale of magnitude along two main dimensions: a quantitative scale and a qualitative scale \cite{mora2009all, claridge2010hyperbole, troiano2018computational}. The distinction between these scales refers to whether a hyperbole primarily concerns objective and measurable aspects or subjective and evaluative emotional states of affairs.
According to \citet{mora2009all}, who conducted a corpus analysis of natural conversation on a 52000 word subset of the British National Corpus (BNC), quantitative hyperboles comprise 61\% of the analyzed hyperboles and include the semantic fields of completeness, universality, measure, and magnitude. Qualitative (evaluative) hyperboles concern positive or negative sentiments, as well as impact or singularity; e.g. `shocking', `smashing' etc.
However, an important point to make here is that there is a significant overlap between these dimensions, as hyperboles will generally have an evaluative function: For instance, the expression that somebody has ``piles of batteries in their room'' could be said to be a negative evaluation of the state of the room, but we choose to annotate such expressions as primarily quantitative, as the exaggerated property is one of measure.
Another potentially relevant distinction is that quantitative hyperboles have a verifiable element, whereas purely qualitative hyperboles often serve to convey an internal subjective mental or emotional state \cite{claridge2010hyperbole}: For instance, in the statement, \textit{It was the worst meal I have ever had}, the speaker could either be conveying their honest opinion of the meal, or they could be using exaggeration as a figure of speech to emphasize their disappointment with the meal.

\paragraph{Type.}
We use the term ``type'' to refer to whether the hyperbole is basic or composite, i.e., whether it stands alone or is combined with another figure of speech. According to \citet{claridge2010hyperbole}, hyperboles are basic if they preserve the semantic domain of the corresponding literal paraphrase, and composite if they involve a domain transfer where elements of a source domain are mapped onto a target domain. The latter is primarily the case with metaphor and, to a lesser extent, metonymy~\cite{claridge2010hyperbole}.
In our annotations, we analyze simile as domain-preserving, even though we recognize that simile can be analyzed as an explicit metaphor \cite{burgers2018metaphor}.

\paragraph{Degree of possibility.}
This distinction is one of degree and refers to the extent to which hyperboles generate impossible, absurd, or counterfactual scenarios. This is purely pragmatic and influences the degree to which a statement may be perceived as hyperbolic \cite{mccarthyThereMillionsThem2004,troiano2018computational}.

\paragraph{Level of conventionality.}
This last dichotomy refers to the fact that hyperboles can use either more conventional or more novel and creative language to express exaggeration. This also impacts the extent to which a statement is perceived as a hyperbole: For instance, to say that one has not seen a person \textit{for ages} is so frequent that it could be considered a latent or dead hyperbole, in the sense that it might not be viewed as intentional exaggeration for a specific purpose \cite{mccarthyThereMillionsThem2004}. However, in our annotation, we do label such frequent sentences as hyperbolic, although a conventionalized one.

\section{Probing PLMs for Hyperbole}

Probing language models aims to answer questions related to the model's internal representation, such as the location and depth of the encoding of a linguistic property in the multi-layer representation, or which input features contributed to a particular behaviour of the PLM \cite{belinkovProbingClassifiersPromises2022}. Standard probing methods involve training a linear classifier on top of a PLM to predict a linguistic property of interest, where a high probing performance on the task is associated with the model encoding said property. It is common practice to freeze the parameters of the PLM, which serves to prevent the gradients of the probing classifier from back-propagating into the model and thereby altering its pre-trained representation \cite{tenneyWHATYOULEARN2019}. Following \citet{aghazadehMetaphorsPreTrainedLanguage2022}, our experiments are not aimed at improving the accuracy of hyperbole identification tasks; we simply want to check the extent to which hyperbole knowledge may be encoded in the base representations. To that end, we employ edge probing, in which the classifier receives span-level representations from the PLM as inputs after they have been projected to a fixed-dimensional layer, 256 in this case. Thus, we define the span input to the PLM as the minimal lexical unit conveying hyperbolic information as given by the HYPO dataset \cite{troiano2018computational}.

One common criticism of edge probing is that it may not be explanatory in the sense that it does not provide insight into whether a model is learning a linguistic property or simply memorizing the task \cite{belinkovProbingClassifiersPromises2022}. An information-theoretic perspective on addressing this limitation is to combine the probing quality of the classifier with some metric of the effort needed to extract the linguistic knowledge. This approach is known as MDL probing \cite{voitaInformationTheoreticProbingMinimum2020}, wherein effort intuitively refers to the number of steps required by the PLM to encode a compressed representation of the input sequence. Following \citet{aghazadehMetaphorsPreTrainedLanguage2022}, we use the online coding implementation of MDL, which measures a representation's ability to learn from various portions of the data. We report the compression, which is given by the ratio between the uniform codelength $N \cdot \log_2(K)$ and the MDL.
In the context of language modelling, $N$ refers to the size of the dataset, and $K$ is the number of classes. A random classifier will have a compression of 1, and increased data compression is associated with a better encoding of the given property.

\section{Experiments}

Here we describe our data and setup.

\paragraph{Dataset and annotation.}
We utilize HYPO, a manually constructed English hyperbole dataset \cite{troiano2018computational} of 709 hyperboles with corresponding literal paraphrases, as well as a \textit{minimal units corpus} that provides the contrastive negative (literal) examples for each hyperbole (see examples (1A) and (1B) in \S\ref{sec:introduction}).

For the purpose of our experiment, we first discard the corpus of literal paraphrases as we are interested in contrasting the hyperbolic usage of a particular word or phrase with a literal usage of the same word or phrase. It would otherwise not be possible to construct spans.
To obtain span labels for each hyperbole and its negative contrast sentence, we programmatically extract the positions of each minimal lexical unit and manually adapt the labels as needed; namely, we exclude examples with multiple spans and those without minimal unit contrasts.\footnote{See examples in Appendix~\ref{app:examples}.}
Our final dataset contains 1396 span-labelled hyperbolic and literal sentences, which we split into training (70\%), test (20\%), and development (10\%) sets.

We meticulously annotate the 63 hyperbolic sentences in the development sample using the operative taxonomy outlined in \S\ref{sec:taxonomy}.\footnote{Similar fine-grained annotations were conducted by \citet{troiano2018computational}, although they weren't included in the HYPO dataset, and inter-annotator agreement was not measured due to the expected degree of difficulty.}
In order to obtain inter-annotator agreement, we enlist the help of 5 additional annotators, assigning 12--13 sentences to each. As a result, each sentence is annotated twice. We observe a mean Cohen's Kappa of 0.339 (see Figure~\ref{fig:kappa}), suggesting only fair agreement, with particular difficulties on the dimension and type spectra of the taxonomy.

\begin{figure}[t] 
  \centering 
  \includegraphics[width=\linewidth]{kappa.jpg} 
  \caption{Inter-annotator agreement for the four aspects.}
  \label{fig:kappa}
  \end{figure}

\paragraph{Experimental setup.}
We conduct edge- and MDL probing experiments for three models, BERT \cite{devlin2018bert}, RoBERTa \cite{liuRoBERTaRobustlyOptimized2019}, and Electra \cite{clark2020electra}.
Following \citet{aghazadehMetaphorsPreTrainedLanguage2022}, all the models are initiated from the base versions of the Huggingface Transformer library \cite{wolf-etal-2020-transformers}, with 12 layers, 768 hidden size, and 110m parameters. In line with the procedure described in detail by \citet{tenneyWHATYOULEARN2019}, we use the contextual vector representation for each span as inputs to the model, followed by a projection layer and self-attention pooling to collapse the span vectors down to a fixed-length 256-dimensional representation. The edge probing classifier, which in this case is a single linear layer, is then trained on top of the PLM.
We do not change the original hyper-parameters; we keep the batch size of 32 and the learning rate of $5 \times 10^{-5}$, and train over 5 epochs for each experiment. During model training, the development set is used to monitor the model's performance and as a stopping criterion at each epoch.
The MDL probe is based on the same structure as the edge probing experiment \cite{aghazadehMetaphorsPreTrainedLanguage2022}. One minor change we make to accommodate the small size of our data is to delete the smallest fraction trained on by the MDL probe, as it would otherwise amount to a single example.
We run our experiments in two configurations: One in which we use the manually labelled hyperbole spans as inputs to the PLM, which follows the classic edge probing procedure. We call this the word-in-context (WiC) representation to emphasize that the model only has access to the rest of the sentence through the context embeddings \cite{tenneyWHATYOULEARN2019}. In the other configuration, which is used as a basis for comparison, we feed the entire sentence span to the model---the so-called sentence-level configuration.

\section{Results}
All our results are reported on the test set.

\paragraph{Edge probing results.}
The edge probing classification results are in Table~\ref{tab:edgeproberes} and the classification scores for the hyperboles and the literal sentences are in Table~\ref{tab:metricsperclass}. We only report last layer scores, as we just evaluate the base representations.

% Tables

\begin{table}[t]
\centering\small
\begin{tabular}{lcccc} 
\toprule
& \multicolumn{2}{c}{\small Word-in-Context}& \multicolumn{2}{c}{\small Sentence Level} \\
Experiment & Accuracy & $\mu$-F1 & Accuracy & $\mu$-F1 \\ 
\midrule 
BERT & 0.69 & 0.6895 & 0.72 & 0.7184 \\
RoBERTa & 0.72 & 0.7220 & 0.78 & 0.7762 \\ 
ELECTRA & 0.73 & 0.7256 & 0.78 & 0.7761 \\
\bottomrule 
\end{tabular} 
\caption{Edge probing classification results.} \label{tab:edgeproberes} 
\end{table}

\begin{table}[t] 
\centering\small
\begin{tabular}{lcccc} 
\toprule 
\small Experiment & \small Class & \small Precision & \small Recall & \small F1 \\ 
\midrule 
\multicolumn{5}{l}{Word-in-Context} \\
\multirow{2}{4em}{BERT} & \small literal & 0.70 & 0.66 & 0.68 \\ 
& \small nonliteral & 0.68 & 0.72 & 0.70 \\ 
\multirow{2}{4em}{RoBERTa} & \small literal & 0.73 & 0.71 & 0.72 \\
& \small nonliteral & 0.71 & 0.73 & 0.72 \\ 
\multirow{2}{4em}{ELECTRA} & \small literal & 0.74 & 0.71 & 0.72 \\
& \small nonliteral & 0.72 & 0.74 & 0.73 \\ 

 \midrule 
\multicolumn{5}{l}{Sentence Level} \\
\multirow{2}{4em}{BERT} & \small literal & 0.78 & 0.61 & 0.69 \\ 
 & \small nonliteral & 0.68 & 0.82 & 0.74 \\ 
\multirow{2}{4em}{RoBERTa} & \small literal & 0.80 & 0.74 & 0.77 \\
 & \small nonliteral & 0.75 & 0.82 & 0.78 \\ 
 \multirow{2}{4em}{ELECTRA} & \small literal & 0.84 & 0.69 & 0.76 \\
 & \small nonliteral & 0.73 & 0.87 & 0.79 \\ 

 \bottomrule 
\end{tabular} 
\caption{Performance metrics for each of the models.} 
\label{tab:metricsperclass} 
\end{table}

\paragraph{MDL probing results.}
We report the compression for each of the experiments in Figure~\ref{fig:mdlresults}. The best layer is consistently near the top layer, but not the top layer itself. 

\begin{figure}[t] % should be figure* for readability of the axes, but this can be changed after review when an extra page will be allowed
\centering 
\begin{minipage}[b]{0.48\linewidth} 
\centering 
\includegraphics[width=\linewidth]{bert_wic.jpg} 
\caption*{BERT WiC.}  
\end{minipage} \hfill 
\begin{minipage}[b]{0.48\linewidth} 
\centering 
\includegraphics[width=\linewidth]{bert_sentence.jpg} \caption*{BERT sentence-level.} 
\end{minipage} 
\begin{minipage}[b]{0.48\linewidth} 
\centering 
\includegraphics[width=\linewidth]{roberta_wic.jpg} \caption*{RoBERTa WiC.} 
\end{minipage} \hfill 
\begin{minipage}[b]{0.48\linewidth} 
\centering \includegraphics[width=\linewidth]{roberta_sentence.jpg} 
\caption*{RoBERTa sentence-level.}
\end{minipage} \hfill
\begin{minipage}[b]{0.48\linewidth} 
  \centering 
  \includegraphics[width=\linewidth]{electra_wic.jpg} 
  \caption*{ELECTRA WiC.}  
  \end{minipage} \hfill
  \begin{minipage}[b]{0.48\linewidth} 
    \centering 
    \includegraphics[width=\linewidth]{electra_sentence.jpg} 
    \caption*{ELECTRA sentence-level.}  
    \end{minipage}
    \caption{Compression for each of the models.} 
\label{fig:mdlresults}
\end{figure}

\paragraph{Error analysis.}
Our error analysis is conducted for the model with the best recall, RoBERTa, and is only conducted for the hyperbolic examples, i.e. the 63 annotated hyperboles in the development set. We choose the best layer based on the compression displayed in Figure~\ref{fig:mdlresults}; i.e. layer 11 for the WiC representation and layer 8 for the sentence-level representation.

Table~\ref{tab:recalls} reports the recalls, i.e. the percentages of correctly predicted hyperboles, for each of the annotated categories, for both of our experiments, along with the distributions of each of the annotations on the 63 samples.

\begin{table}[t] 
\centering\scalebox{.8}{
\begin{tabular}{lrrr} 
\toprule 
\small Annotation & {\small WiC} & {\small Sentence} & {\small Total} \\ 
\midrule 
\textsc{QUAL} & 0.784 & 0.865 & 37 \\ 
\textsc{QUANT} & 0.692 & 0.731 & 26 \\ 
\textsc{PDOM} & 0.676 & 0.765 & 34 \\ 
\textsc{SDOM} & 0.828 & 0.862 & 29 \\ 
\textsc{NPOSS} & 0.769 & 0.821 & 39 \\ 
\textsc{POSS} & 0.708 & 0.792 & 24 \\ 
\textsc{CONV} & 0.806 & 0.806 & 36 \\ 
\textsc{NCONV} & 0.667 & 0.815 & 27 \\ 
\bottomrule 
\end{tabular}}
\caption{Recall for word-in-context and sentence-level annotations for each category.} 
\label{tab:recalls} 
\end{table}

\section{Discussion}

We observe notably lower scores than for the metaphor probing experiments across the board: Based on the compression reported for the MDL probes, only reaching up to 1.4 in the best configuration, we can conclude that hyperbolic information does appear to be encoded in PLM representations, albeit only to a minor extent. This is in line with our hypothesis that encoding hyperbole may pose a bigger challenge given its primarily pragmatic nature, and also fits with the fact that PLMs have been reported to struggle with pragmatic inference and commonsense knowledge \cite{rogers-etal-2020-primer}.
Perhaps more interestingly, we can inspect the compression for each of the 12 layers reported in Figure~\ref{fig:mdlresults} to understand where hyperbole is best encoded by the representation, which appears to mostly be in the final layers. This is different from metaphor and may lend further credence to the idea that pragmatics is typically encoded deeper into the PLM. However, since we are employing a very small dataset, the extent to which we can draw definite conclusions is limited. In the future, we would like to extend our experiments to more data and languages to measure generalizability.

Upon analyzing the MDL compressions of the two model representations, we make an intriguing observation that the sentence-level representation consistently outperforms the WiC representation, with compressions reaching up to 1.4 for the top layer. This discovery raises thought-provoking questions about the amount of hyperbole information inferred by the contextual embeddings, as hyperbole often surpasses the token or phrase level. For example, consider the sentence, ``The temperature was so low, I saw polar bears wearing jackets.'' In this case, the entire complement sentence creates the hyperbole. This leads to discussions about defining the lexical unit of hyperboles for corpus collection and annotation purposes \cite{burgers2016hip}.
As for the model representations themselves, while PLMs theoretically encode context in their representation, it is worth exploring how much information is contained within and between subwords in the WiC representation. Employing interpretability metrics could provide further insights into this matter.

Considering the low inter-annotator agreement and that recall seems to generally increase with the frequency of the subcategory in the sample, it is challenging to draw insights from the model error analysis (see Table~\ref{tab:recalls}). However, we may tentatively conclude that the models have an easier time with conventional hyperboles, which is the opposite finding to that of \citet{troiano2018computational} for traditional machine learning pipelines. It is similarly surprising that the PLMs have better recall for domain-switching hyperboles than domain-preserving ones, which may also be confounded by a strength variable.
Furthermore, when manually inspecting the false positives, we observe that some sentences predicted to be hyperbolic do indeed contain words and phrases with a potential hyperbolic interpretation, e.g. \textit{paradise} in the sentence ``He thought a place awaited him in paradise'', suggesting that analyzing hyperbole in a larger context might provide further insights.

Finally, the low inter-annotator agreement, particularly on the dimension and type dichotomies, suggests that the hyperbole categories are not intuitively well-understood or discriminated. During discussions with annotators upon completion of the task, we had several instances where overlap of the dimension subcategories was so large that annotators could argue for either one, and it was also not clear to annotators when a semantic domain-switch was present. The latter suggests that more linguistic training may be necessary to identify combined figures of speech in context, for instance, through application of the hyperbole identification procedure (HIP) \cite{burgers2016hip}.
As a consequence, we would like to change our approach to hyperbole annotation in future corpus construction and investigate to which extent these categories are indeed computationally relevant. Our negative findings lend credence to the claim by \citet{biddle2021harnessing} that annotation schemes may present a bottleneck for further development of the task. 
We would also like to explore approaches for model evaluation of hyperbole types using conceptual knowledge bases and linguistic resources; namely leveraging framenets to explore their utility for metaphorical hyperboles, as well as investigating templates using particular syntactic patterns for evaluating quantitative hyperboles.

\section{Conclusions}

This study has attempted to probe three pre-trained language models (PLMs) for hyperbolic knowledge to better inspect how this information is encoded in their representations. We find, predictably, that knowledge of hyperbole is only to a limited extent encoded by PLMs, and, somewhat more surprisingly, that sentence-level representations appear to be superior to word-in-context (WiC) representations, which may further highlight that most hyperbolic information does in fact exist beyond the token or phrase level. In the future, we would like to contribute with more hyperbole data with an operational annotation procedure, extend to cross-lingual experiments, as well as investigate the role of linguistic resources for hyperbole identification.

\bibliography{anthology,references}
\bibliographystyle{acl_natbib}

\appendix

\section{Fine-grained Annotation Examples}\label{app:examples}

Table~\ref{tab:examples} shows example data, along with the spans and annotations (taken from the development set of the data). The annotations are constructed along dimension (\textsc{QUANT}/\textsc{QUAL}), type (\textsc{PDOM}/\textsc{SDOM}), possibility (\textsc{POSS}/\textsc{NPOSS}), and conventionality (\textsc{CONV}/\textsc{NCONV}).

\begin{table*}[t!] 
	\centering \small
	% \resizebox{\linewidth}{!}
	\begin{tabular}{llllll} 
		\toprule 
		\textbf{Hyperbole} & \textbf{Literal} & \textbf{Dim.} & \textbf{Type} & \textbf{Poss.} & \textbf{Conv.} \\ 
		\midrule 
		Marriage is the \emph{grave} of love. & \multirow{2}{.3\linewidth}{I have gone to visit the grave of a friend.} & \textsc{QUAL} & \textsc{SDOM} & \textsc{NPOSS} & \textsc{CONV} \\\\
		\multirow{2}{.3\linewidth}{So much snow that it is like walking in the \emph{firmament}.} & \multirow{2}{.3\linewidth}{Some stars in the firmament have a name.} & \textsc{QUANT} & \textsc{PDOM} & \textsc{NPOSS} & \textsc{NCONV} \\\\
		\multirow{3}{.3\linewidth}{The ancient castle was so big that it took \emph{a week} to walk from one end to the other.} & \multirow{2}{.3\linewidth}{It took a week to walk from one end of the region to the other.} & \textsc{QUANT} & \textsc{PDOM} & \textsc{POSS} & \textsc{CONV} \\\\\\
		His feet are \emph{colder than the arctic}. & \multirow{2}{.3\linewidth}{The Antarctic is colder than the Arctic.} & \textsc{QUANT} & \textsc{PDOM} & \textsc{NPOSS} & \textsc{NCONV} \\\\
		\bottomrule 
	\end{tabular} 
	\caption{Sample data with annotations. Token spans are marked by italics around the word or phrase.}% Dim.: Dimension, Poss.: Possibility, Conv.: Conventionality.} 
\label{tab:examples} 
\end{table*}

\end{document}