% ACL Anthology BibTeX record: workshop paper in the SLPAT-2022 proceedings, pages 30--36.
@inproceedings{riad-etal-2022-comparison,
    title     = {A comparison study on patient-psychologist voice diarization},
    author    = {Riad, Rachid and
                 Titeux, Hadrien and
                 Lemoine, Laurie and
                 Montillot, Justine and
                 Sliwinski, Agnes and
                 Bagnou, Jennifer and
                 Cao, Xuan and
                 Bachoud-Levi, Anne-Catherine and
                 Dupoux, Emmanuel},
    editor    = {Ebling, Sarah and
                 Prud{'}hommeaux, Emily and
                 Vaidyanathan, Preethi},
    booktitle = {Ninth Workshop on Speech and Language Processing for Assistive Technologies (SLPAT-2022)},
    month     = may,
    year      = {2022},
    address   = {Dublin, Ireland},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2022.slpat-1.4},
    doi       = {10.18653/v1/2022.slpat-1.4},
    pages     = {30--36},
    abstract  = {Conversations between a clinician and a patient, in natural conditions, are valuable sources of information for medical follow-up. The automatic analysis of these dialogues could help extract new language markers and speed up the clinicians{'} reports. Yet, it is not clear which model is the most efficient to detect and identify the speaker turns, especially for individuals with speech disorders. Here, we proposed a split of the data that allows conducting a comparative evaluation of different diarization methods. We designed and trained end-to-end neural network architectures to directly tackle this task from the raw signal and evaluate each approach under the same metric. We also studied the effect of fine-tuning models to find the best performance. Experimental results are reported on naturalistic clinical conversations between Psychologists and Interviewees, at different stages of Huntington{'}s disease, displaying a large panel of speech disorders. We found out that our best end-to-end model achieved 19.5 {\%} IER on the test set, compared to 23.6{\%} achieved by the finetuning of the X-vector architecture. Finally, we observed that we could extract clinical markers directly from the automatic systems, highlighting the clinical relevance of our methods.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, ID riad-etal-2022-comparison. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="riad-etal-2022-comparison">
    <titleInfo>
      <title>A comparison study on patient-psychologist voice diarization</title>
    </titleInfo>
    <!-- Authors: one personal name element each, all with the role term "author". -->
    <name type="personal">
      <namePart type="given">Rachid</namePart>
      <namePart type="family">Riad</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hadrien</namePart>
      <namePart type="family">Titeux</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Laurie</namePart>
      <namePart type="family">Lemoine</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Justine</namePart>
      <namePart type="family">Montillot</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Agnes</namePart>
      <namePart type="family">Sliwinski</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jennifer</namePart>
      <namePart type="family">Bagnou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xuan</namePart>
      <namePart type="family">Cao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Anne-Catherine</namePart>
      <namePart type="family">Bachoud-Levi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Emmanuel</namePart>
      <namePart type="family">Dupoux</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Ninth Workshop on Speech and Language Processing for Assistive Technologies (SLPAT-2022)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Sarah</namePart>
        <namePart type="family">Ebling</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Emily</namePart>
        <namePart type="family">Prud’hommeaux</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preethi</namePart>
        <namePart type="family">Vaidyanathan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Conversations between a clinician and a patient, in natural conditions, are valuable sources of information for medical follow-up. The automatic analysis of these dialogues could help extract new language markers and speed up the clinicians’ reports. Yet, it is not clear which model is the most efficient to detect and identify the speaker turns, especially for individuals with speech disorders. Here, we proposed a split of the data that allows conducting a comparative evaluation of different diarization methods. We designed and trained end-to-end neural network architectures to directly tackle this task from the raw signal and evaluate each approach under the same metric. We also studied the effect of fine-tuning models to find the best performance. Experimental results are reported on naturalistic clinical conversations between Psychologists and Interviewees, at different stages of Huntington’s disease, displaying a large panel of speech disorders. We found out that our best end-to-end model achieved 19.5 % IER on the test set, compared to 23.6% achieved by the finetuning of the X-vector architecture. Finally, we observed that we could extract clinical markers directly from the automatic systems, highlighting the clinical relevance of our methods.</abstract>
    <!-- Identifiers and access URL for the record. -->
    <identifier type="citekey">riad-etal-2022-comparison</identifier>
    <identifier type="doi">10.18653/v1/2022.slpat-1.4</identifier>
    <location>
      <url>https://aclanthology.org/2022.slpat-1.4</url>
    </location>
    <!-- Date and page extent (start/end) of this part. -->
    <part>
      <date>2022-05</date>
      <extent unit="page">
        <start>30</start>
        <end>36</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A comparison study on patient-psychologist voice diarization
%A Riad, Rachid
%A Titeux, Hadrien
%A Lemoine, Laurie
%A Montillot, Justine
%A Sliwinski, Agnes
%A Bagnou, Jennifer
%A Cao, Xuan
%A Bachoud-Levi, Anne-Catherine
%A Dupoux, Emmanuel
%Y Ebling, Sarah
%Y Prud’hommeaux, Emily
%Y Vaidyanathan, Preethi
%S Ninth Workshop on Speech and Language Processing for Assistive Technologies (SLPAT-2022)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F riad-etal-2022-comparison
%X Conversations between a clinician and a patient, in natural conditions, are valuable sources of information for medical follow-up. The automatic analysis of these dialogues could help extract new language markers and speed up the clinicians’ reports. Yet, it is not clear which model is the most efficient to detect and identify the speaker turns, especially for individuals with speech disorders. Here, we proposed a split of the data that allows conducting a comparative evaluation of different diarization methods. We designed and trained end-to-end neural network architectures to directly tackle this task from the raw signal and evaluate each approach under the same metric. We also studied the effect of fine-tuning models to find the best performance. Experimental results are reported on naturalistic clinical conversations between Psychologists and Interviewees, at different stages of Huntington’s disease, displaying a large panel of speech disorders. We found out that our best end-to-end model achieved 19.5 % IER on the test set, compared to 23.6% achieved by the finetuning of the X-vector architecture. Finally, we observed that we could extract clinical markers directly from the automatic systems, highlighting the clinical relevance of our methods.
%R 10.18653/v1/2022.slpat-1.4
%U https://aclanthology.org/2022.slpat-1.4
%U https://doi.org/10.18653/v1/2022.slpat-1.4
%P 30-36
Markdown (Informal)
[A comparison study on patient-psychologist voice diarization](https://aclanthology.org/2022.slpat-1.4) (Riad et al., SLPAT 2022)
ACL
- Rachid Riad, Hadrien Titeux, Laurie Lemoine, Justine Montillot, Agnes Sliwinski, Jennifer Bagnou, Xuan Cao, Anne-Catherine Bachoud-Levi, and Emmanuel Dupoux. 2022. A comparison study on patient-psychologist voice diarization. In Ninth Workshop on Speech and Language Processing for Assistive Technologies (SLPAT-2022), pages 30–36, Dublin, Ireland. Association for Computational Linguistics.