author    = {Wajdi Ghezaiel and Luc Brun and Olivier L{\'{e}}zoray},
title     = {Wavelet Scattering Transform and {CNN} for Closed Set Speaker Identification},
booktitle = {22nd {IEEE} International Workshop on Multimedia Signal Processing, {MMSP} 2020, Tampere, Finland, September 21-24, 2020},
pages     = {1--6},
publisher = {{IEEE}},
year      = {2020},
url       = {IEEXplore:=https://doi.org/10.1109/MMSP48831.2020.9287061,PDF:=https://brunl01.users.greyc.fr/ARTICLES/Ghezaiel_MMSP2020.pdf,HAL:=https://hal.archives-ouvertes.fr/hal-02955532v1},
timestamp = {Wed, 13 Jan 2021 17:58:38 +0100},
biburl    = {https://dblp.org/rec/conf/mmsp/GhezaielBL20.bib},
abstract  = "In real world applications, the performances of speaker identification systems degrade due to the reduction of both the amount and the quality of speech utterance. For that particular purpose, we propose a speaker identification system where short utterances with few training examples are used for person identification. Therefore, only a very small amount of data involving a sentence of 2-4 seconds is used. To achieve this, we propose a novel raw waveform end-to-end convolutional neural network (CNN) for text-independent speaker identification. We use wavelet scattering transform as a fixed initialization of the first layers of a CNN network, and learn the remaining layers in a supervised manner. The conducted experiments show that our hybrid architecture combining wavelet scattering transform and CNN can successfully perform efficient feature extraction for a speaker identification, even with a small number of short duration training samples.",
theme     = "pattern"