@inproceedings{lee-etal-2023-common,
title = "Common Law Annotations: Investigating the Stability of Dialog System Output Annotations",
author = "Lee, Seunggun and
DeLucia, Alexandra and
Nangia, Nikita and
Ganedi, Praneeth and
Guan, Ryan and
Li, Rubing and
Ngaw, Britney and
Singhal, Aditya and
Vaidya, Shalaka and
Yuan, Zijun and
Zhang, Lining and
Sedoc, Jo{\~a}o",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.780",
doi = "10.18653/v1/2023.findings-acl.780",
pages = "12315--12349",
abstract = "Metrics for Inter-Annotator Agreement (IAA), like Cohen{'}s Kappa, are crucial for validating annotated datasets. Although high agreement is often used to show the reliability of annotation procedures, it is insufficient to ensure or reproducibility. While researchers are encouraged to increase annotator agreement, this can lead to specific and tailored annotation guidelines. We hypothesize that this may result in diverging annotations from different groups. To study this, we first propose the Lee et al. Protocol (LEAP), a standardized and codified annotation protocol. LEAP strictly enforces transparency in the annotation process, which ensures reproducibility of annotation guidelines. Using LEAP to annotate a dialog dataset, we empirically show that while research groups may create reliable guidelines by raising agreement, this can cause divergent annotations across different research groups, thus questioning the validity of the annotations. Therefore, we caution NLP researchers against using reliability as a proxy for reproducibility and validity.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lee-etal-2023-common">
<titleInfo>
<title>Common Law Annotations: Investigating the Stability of Dialog System Output Annotations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Seunggun</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alexandra</namePart>
<namePart type="family">DeLucia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikita</namePart>
<namePart type="family">Nangia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Praneeth</namePart>
<namePart type="family">Ganedi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ryan</namePart>
<namePart type="family">Guan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rubing</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Britney</namePart>
<namePart type="family">Ngaw</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aditya</namePart>
<namePart type="family">Singhal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shalaka</namePart>
<namePart type="family">Vaidya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zijun</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lining</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Metrics for Inter-Annotator Agreement (IAA), like Cohen’s Kappa, are crucial for validating annotated datasets. Although high agreement is often used to show the reliability of annotation procedures, it is insufficient to ensure validity or reproducibility. While researchers are encouraged to increase annotator agreement, this can lead to specific and tailored annotation guidelines. We hypothesize that this may result in diverging annotations from different groups. To study this, we first propose the Lee et al. Protocol (LEAP), a standardized and codified annotation protocol. LEAP strictly enforces transparency in the annotation process, which ensures reproducibility of annotation guidelines. Using LEAP to annotate a dialog dataset, we empirically show that while research groups may create reliable guidelines by raising agreement, this can cause divergent annotations across different research groups, thus questioning the validity of the annotations. Therefore, we caution NLP researchers against using reliability as a proxy for reproducibility and validity.</abstract>
<identifier type="citekey">lee-etal-2023-common</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.780</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.780</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>12315</start>
<end>12349</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Common Law Annotations: Investigating the Stability of Dialog System Output Annotations
%A Lee, Seunggun
%A DeLucia, Alexandra
%A Nangia, Nikita
%A Ganedi, Praneeth
%A Guan, Ryan
%A Li, Rubing
%A Ngaw, Britney
%A Singhal, Aditya
%A Vaidya, Shalaka
%A Yuan, Zijun
%A Zhang, Lining
%A Sedoc, João
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F lee-etal-2023-common
%X Metrics for Inter-Annotator Agreement (IAA), like Cohen’s Kappa, are crucial for validating annotated datasets. Although high agreement is often used to show the reliability of annotation procedures, it is insufficient to ensure validity or reproducibility. While researchers are encouraged to increase annotator agreement, this can lead to specific and tailored annotation guidelines. We hypothesize that this may result in diverging annotations from different groups. To study this, we first propose the Lee et al. Protocol (LEAP), a standardized and codified annotation protocol. LEAP strictly enforces transparency in the annotation process, which ensures reproducibility of annotation guidelines. Using LEAP to annotate a dialog dataset, we empirically show that while research groups may create reliable guidelines by raising agreement, this can cause divergent annotations across different research groups, thus questioning the validity of the annotations. Therefore, we caution NLP researchers against using reliability as a proxy for reproducibility and validity.
%R 10.18653/v1/2023.findings-acl.780
%U https://aclanthology.org/2023.findings-acl.780
%U https://doi.org/10.18653/v1/2023.findings-acl.780
%P 12315-12349
Markdown (Informal)
[Common Law Annotations: Investigating the Stability of Dialog System Output Annotations](https://aclanthology.org/2023.findings-acl.780) (Lee et al., Findings 2023)
ACL
Seunggun Lee, Alexandra DeLucia, Nikita Nangia, Praneeth Ganedi, Ryan Guan, Rubing Li, Britney Ngaw, Aditya Singhal, Shalaka Vaidya, Zijun Yuan, Lining Zhang, and João Sedoc. 2023. Common Law Annotations: Investigating the Stability of Dialog System Output Annotations. In Findings of the Association for Computational Linguistics: ACL 2023, pages 12315–12349, Toronto, Canada. Association for Computational Linguistics.
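Python (Illustrative)
The abstract names Cohen's Kappa as a representative Inter-Annotator Agreement (IAA) metric for validating annotated datasets. The snippet below is a minimal sketch, not drawn from the paper or its protocol, showing how Cohen's Kappa can be computed for two hypothetical annotators labeling the same dialog system outputs; the annotator lists and labels are invented for illustration.

from collections import Counter

def cohens_kappa(labels_a, labels_b):
    """Chance-corrected agreement between two annotators over the same items."""
    assert len(labels_a) == len(labels_b) and labels_a
    n = len(labels_a)
    # Observed agreement: fraction of items both annotators labeled identically.
    p_o = sum(a == b for a, b in zip(labels_a, labels_b)) / n
    # Expected agreement: probability both pick the same label by chance,
    # given each annotator's marginal label distribution.
    freq_a, freq_b = Counter(labels_a), Counter(labels_b)
    p_e = sum(freq_a[c] * freq_b[c] for c in freq_a) / (n * n)
    return (p_o - p_e) / (1 - p_e)

# Hypothetical judgments of six dialog responses as "good" / "bad".
annotator_1 = ["good", "good", "bad", "good", "bad", "bad"]
annotator_2 = ["good", "bad",  "bad", "good", "bad", "good"]
print(f"kappa = {cohens_kappa(annotator_1, annotator_2):.3f}")  # kappa = 0.333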