@inproceedings{ignatev-etal-2026-dataset,
title = "Dataset Cartography for Implicit Discourse Relation Recognition: Promises and Pitfalls",
author = "Ignatev, Daniil and
Paperno, Denis and
Poesio, Massimo",
editor = "Braud, Chlo{\'e} and
Hardmeier, Christian and
Ogrodniczuk, Maciej and
Loaiciga, Sharid and
Zeldes, Amir and
Nov{\'a}k, Michal and
Li, Chuyuan and
Strube, Michael and
Li, Junyi Jessy",
booktitle = "Proceedings of the 2nd Joint Workshop on Computational Approaches to Discourse, Context and Document-Level Inferences and Computational Models of Reference, Anaphora and Coreference ({CODI}-{CRAC} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.codi-1.8/",
pages = "53--64",
ISBN = "979-8-89176-400-2",
abstract = "Crowdsourced data for implicit discourse relation recognition, IDRR, has been shown to contain both plausible interpretations and noisy annotations. We present a case study of dataset cartography (Swayamdipta et al., 2020) on the IDRR-focused DiscoGeM corpus (Scholman et al., 2022). Our findings show that error identification via low confidence proves unreliable, as confidence is strongly affected by label rarity. However, high-confidence datapoints reveal a different use case: auditing the cue-rich regions of the dataset. Our lexical probe demonstrates an association between high-confidence items and (mostly temporal) intra-argument cue words. Dataset cartography can thus serve as a diagnostic of cue-driven easy-to-learn cases, which need to be balanced out to ensure the robustness of IDRR learning."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ignatev-etal-2026-dataset">
<titleInfo>
<title>Dataset Cartography for Implicit Discourse Relation Recognition: Promises and Pitfalls</title>
</titleInfo>
<name type="personal">
<namePart type="given">Daniil</namePart>
<namePart type="family">Ignatev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Denis</namePart>
<namePart type="family">Paperno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Massimo</namePart>
<namePart type="family">Poesio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Joint Workshop on Computational Approaches to Discourse, Context and Document-Level Inferences and Computational Models of Reference, Anaphora and Coreference (CODI-CRAC 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chloé</namePart>
<namePart type="family">Braud</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christian</namePart>
<namePart type="family">Hardmeier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maciej</namePart>
<namePart type="family">Ogrodniczuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sharid</namePart>
<namePart type="family">Loaiciga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Amir</namePart>
<namePart type="family">Zeldes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Novák</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chuyuan</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Strube</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junyi</namePart>
<namePart type="given">Jessy</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-400-2</identifier>
</relatedItem>
<abstract>Crowdsourced data for implicit discourse relation recognition, IDRR, has been shown to contain both plausible interpretations and noisy annotations. We present a case study of dataset cartography (Swayamdipta et al., 2020) on the IDRR-focused DiscoGeM corpus (Scholman et al., 2022). Our findings show that error identification via low confidence proves unreliable, as confidence is strongly affected by label rarity. However, high-confidence datapoints reveal a different use case: auditing the cue-rich regions of the dataset. Our lexical probe demonstrates an association between high-confidence items and (mostly temporal) intra-argument cue words. Dataset cartography can thus serve as a diagnostic of cue-driven easy-to-learn cases, which need to be balanced out to ensure the robustness of IDRR learning.</abstract>
<identifier type="citekey">ignatev-etal-2026-dataset</identifier>
<location>
<url>https://aclanthology.org/2026.codi-1.8/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>53</start>
<end>64</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Dataset Cartography for Implicit Discourse Relation Recognition: Promises and Pitfalls
%A Ignatev, Daniil
%A Paperno, Denis
%A Poesio, Massimo
%Y Braud, Chloé
%Y Hardmeier, Christian
%Y Ogrodniczuk, Maciej
%Y Loaiciga, Sharid
%Y Zeldes, Amir
%Y Novák, Michal
%Y Li, Chuyuan
%Y Strube, Michael
%Y Li, Junyi Jessy
%S Proceedings of the 2nd Joint Workshop on Computational Approaches to Discourse, Context and Document-Level Inferences and Computational Models of Reference, Anaphora and Coreference (CODI-CRAC 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-400-2
%F ignatev-etal-2026-dataset
%X Crowdsourced data for implicit discourse relation recognition, IDRR, has been shown to contain both plausible interpretations and noisy annotations. We present a case study of dataset cartography (Swayamdipta et al., 2020) on the IDRR-focused DiscoGeM corpus (Scholman et al., 2022). Our findings show that error identification via low confidence proves unreliable, as confidence is strongly affected by label rarity. However, high-confidence datapoints reveal a different use case: auditing the cue-rich regions of the dataset. Our lexical probe demonstrates an association between high-confidence items and (mostly temporal) intra-argument cue words. Dataset cartography can thus serve as a diagnostic of cue-driven easy-to-learn cases, which need to be balanced out to ensure the robustness of IDRR learning.
%U https://aclanthology.org/2026.codi-1.8/
%P 53-64
Markdown (Informal)
[Dataset Cartography for Implicit Discourse Relation Recognition: Promises and Pitfalls](https://aclanthology.org/2026.codi-1.8/) (Ignatev et al., CODI-CRAC 2026)
ACL
- Daniil Ignatev, Denis Paperno, and Massimo Poesio. 2026. Dataset Cartography for Implicit Discourse Relation Recognition: Promises and Pitfalls. In Proceedings of the 2nd Joint Workshop on Computational Approaches to Discourse, Context and Document-Level Inferences and Computational Models of Reference, Anaphora and Coreference (CODI-CRAC 2026), pages 53–64, San Diego, California, USA. Association for Computational Linguistics.