% BibTeX record for a paper in the Proceedings of the 6th Clinical Natural
% Language Processing Workshop (ACL Anthology id 2024.clinicalnlp-1.8).
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{burdisso-etal-2024-daic,
    title = "{DAIC}-{WOZ}: On the Validity of Using the Therapist{'}s prompts in Automatic Depression Detection from Clinical Interviews",
    author = "Burdisso, Sergio and
      Reyes-Ram{\'\i}rez, Ernesto and
      Villatoro-tello, Esa{\'u} and
      S{\'a}nchez-Vega, Fernando and
      Lopez Monroy, Adrian and
      Motlicek, Petr",
    editor = "Naumann, Tristan and
      Ben Abacha, Asma and
      Bethard, Steven and
      Roberts, Kirk and
      Bitterman, Danielle",
    booktitle = "Proceedings of the 6th Clinical Natural Language Processing Workshop",
    month = jun,
    year = "2024",
    address = "Mexico City, Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.clinicalnlp-1.8",
    doi = "10.18653/v1/2024.clinicalnlp-1.8",
    pages = "82--90",
    abstract = "Automatic depression detection from conversational data has gained significant interest in recent years. The DAIC-WOZ dataset, interviews conducted by a human-controlled virtual agent, has been widely used for this task. Recent studies have reported enhanced performance when incorporating interviewer{'}s prompts into the model. In this work, we hypothesize that this improvement might be mainly due to a bias present in these prompts, rather than the proposed architectures and methods. Through ablation experiments and qualitative analysis, we discover that models using interviewer{'}s prompts learn to focus on a specific region of the interviews, where questions about past experiences with mental health issues are asked, and use them as discriminative shortcuts to detect depressed participants. In contrast, models using participant responses gather evidence from across the entire interview. Finally, to highlight the magnitude of this bias, we achieve a 0.90 F1 score by intentionally exploiting it, the highest result reported to date on this dataset using only textual information. Our findings underline the need for caution when incorporating interviewers{'} prompts into models, as they may inadvertently learn to exploit targeted prompts, rather than learning to characterize the language and behavior that are genuinely indicative of the patient{'}s mental health condition.",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record (citekey burdisso-etal-2024-daic).
     Layout of the record: article title, six personal names with role "author",
     issue date, a host relatedItem (proceedings title, five personal names with
     role "editor", publisher and place), abstract, identifiers, URL, page extent. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="burdisso-etal-2024-daic">
    <titleInfo>
      <title>DAIC-WOZ: On the Validity of Using the Therapist’s prompts in Automatic Depression Detection from Clinical Interviews</title>
    </titleInfo>
    <!-- Authors, in the order given by the record -->
    <name type="personal">
      <namePart type="given">Sergio</namePart>
      <namePart type="family">Burdisso</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ernesto</namePart>
      <namePart type="family">Reyes-Ramírez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Esaú</namePart>
      <namePart type="family">Villatoro-tello</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Fernando</namePart>
      <namePart type="family">Sánchez-Vega</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adrian</namePart>
      <namePart type="family">Lopez Monroy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Petr</namePart>
      <namePart type="family">Motlicek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher details -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 6th Clinical Natural Language Processing Workshop</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Tristan</namePart>
        <namePart type="family">Naumann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Asma</namePart>
        <namePart type="family">Ben Abacha</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Bethard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kirk</namePart>
        <namePart type="family">Roberts</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Danielle</namePart>
        <namePart type="family">Bitterman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Mexico City, Mexico</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Automatic depression detection from conversational data has gained significant interest in recent years. The DAIC-WOZ dataset, interviews conducted by a human-controlled virtual agent, has been widely used for this task. Recent studies have reported enhanced performance when incorporating interviewer’s prompts into the model. In this work, we hypothesize that this improvement might be mainly due to a bias present in these prompts, rather than the proposed architectures and methods. Through ablation experiments and qualitative analysis, we discover that models using interviewer’s prompts learn to focus on a specific region of the interviews, where questions about past experiences with mental health issues are asked, and use them as discriminative shortcuts to detect depressed participants. In contrast, models using participant responses gather evidence from across the entire interview. Finally, to highlight the magnitude of this bias, we achieve a 0.90 F1 score by intentionally exploiting it, the highest result reported to date on this dataset using only textual information. Our findings underline the need for caution when incorporating interviewers’ prompts into models, as they may inadvertently learn to exploit targeted prompts, rather than learning to characterize the language and behavior that are genuinely indicative of the patient’s mental health condition.</abstract>
    <identifier type="citekey">burdisso-etal-2024-daic</identifier>
    <identifier type="doi">10.18653/v1/2024.clinicalnlp-1.8</identifier>
    <location>
      <url>https://aclanthology.org/2024.clinicalnlp-1.8</url>
    </location>
    <!-- Position of the article within the host volume -->
    <part>
      <date>2024-06</date>
      <extent unit="page">
        <start>82</start>
        <end>90</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T DAIC-WOZ: On the Validity of Using the Therapist’s prompts in Automatic Depression Detection from Clinical Interviews
%A Burdisso, Sergio
%A Reyes-Ramírez, Ernesto
%A Villatoro-tello, Esaú
%A Sánchez-Vega, Fernando
%A Lopez Monroy, Adrian
%A Motlicek, Petr
%Y Naumann, Tristan
%Y Ben Abacha, Asma
%Y Bethard, Steven
%Y Roberts, Kirk
%Y Bitterman, Danielle
%S Proceedings of the 6th Clinical Natural Language Processing Workshop
%D 2024
%8 June
%I Association for Computational Linguistics
%C Mexico City, Mexico
%F burdisso-etal-2024-daic
%X Automatic depression detection from conversational data has gained significant interest in recent years. The DAIC-WOZ dataset, interviews conducted by a human-controlled virtual agent, has been widely used for this task. Recent studies have reported enhanced performance when incorporating interviewer’s prompts into the model. In this work, we hypothesize that this improvement might be mainly due to a bias present in these prompts, rather than the proposed architectures and methods. Through ablation experiments and qualitative analysis, we discover that models using interviewer’s prompts learn to focus on a specific region of the interviews, where questions about past experiences with mental health issues are asked, and use them as discriminative shortcuts to detect depressed participants. In contrast, models using participant responses gather evidence from across the entire interview. Finally, to highlight the magnitude of this bias, we achieve a 0.90 F1 score by intentionally exploiting it, the highest result reported to date on this dataset using only textual information. Our findings underline the need for caution when incorporating interviewers’ prompts into models, as they may inadvertently learn to exploit targeted prompts, rather than learning to characterize the language and behavior that are genuinely indicative of the patient’s mental health condition.
%R 10.18653/v1/2024.clinicalnlp-1.8
%U https://aclanthology.org/2024.clinicalnlp-1.8
%U https://doi.org/10.18653/v1/2024.clinicalnlp-1.8
%P 82-90
Markdown (Informal)
[DAIC-WOZ: On the Validity of Using the Therapist’s prompts in Automatic Depression Detection from Clinical Interviews](https://aclanthology.org/2024.clinicalnlp-1.8) (Burdisso et al., ClinicalNLP-WS 2024)
ACL