#
# Copyright (C) 2016-2019 by Nathan Lovato, Daniel Oakey, Razvan Radulescu, and contributors
#
# This file is part of Power Sequencer.
#
# Power Sequencer is free software: you can redistribute it and/or modify it under the terms of the
# GNU General Public License as published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Power Sequencer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with Power Sequencer. If
# not, see <http://www.gnu.org/licenses/>.
#
import numpy as np
from scipy.signal import lfilter
from scipy.fftpack import fft, dct

try:
    # Recent SciPy releases no longer re-export the window functions from
    # scipy.signal; scipy.signal.windows is the supported location.
    from scipy.signal.windows import hamming
except ImportError:
    from scipy.signal import hamming

from .trfbank import trfbank
from .segment_axis import segment_axis


def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed
    nwin: int
        length of the Hamming window; also handed to ``segment_axis`` as its
        second argument (presumably the frame length -- confirm against
        ``segment_axis``)
    nfft: int
        number of points of the FFT applied to each windowed frame
    fs: int
        sampling rate handed to ``trfbank`` to build the filter bank
        (presumably in Hz)
    nceps: int
        number of leading DCT coefficients kept for each frame

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients
    mspec: ndarray
        Log-spectrum in the mel-domain.
    spec: ndarray
        Magnitude of the ``nfft``-point FFT of each windowed frame.

    Notes
    -----
    MFCC are computed as follows:
        * Pre-processing in time-domain (pre-emphasizing)
        * Compute the spectrum amplitude by windowing with a Hamming window
        * Filter the signal in the spectral domain with a triangular
          filter-bank, whose filters are approximately linearly spaced on the
          mel scale, and have equal bandwidth in the mel scale
        * Compute the DCT of the log-spectrum

    Filter-bank outputs that are exactly zero (e.g. frames of digital silence)
    are clamped to the smallest positive normal float before the logarithm is
    taken, so ``mspec`` and ``ceps`` stay finite instead of turning into
    ``-inf``/``nan``. Non-zero outputs are not affected.

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    # MFCC parameters: taken from auditory toolbox
    # Passed to segment_axis as its third argument (presumably the overlap
    # between consecutive frames, i.e. a hop of 160 samples -- confirm).
    over = nwin - 160
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97

    # lowfreq = 400 / 3.
    lowfreq = 133.33
    # highfreq = 6855.4976
    linsc = 200 / 3.0
    logsc = 1.0711703

    nlinfil = 13
    nlogfil = 27

    w = hamming(nwin, sym=False)

    # Only the first element returned by trfbank is used here.
    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    # ------------------
    # Compute the MFCC
    # ------------------
    extract = lfilter([1.0, -prefac], 1, input)
    framed = segment_axis(extract, nwin, over) * w

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))

    # Filter the spectrum through the triangle filterbank. Clamp before the
    # log so that all-zero frames do not produce -inf (and then nan in the DCT).
    energies = np.dot(spec, fbank.T)
    mspec = np.log10(np.maximum(energies, np.finfo(np.float64).tiny))

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm="ortho", axis=-1)[:, :nceps]

    return ceps, mspec, spec