diff options
Diffstat (limited to 'power_sequencer/operators/audiosync/mfcc')
-rw-r--r-- | power_sequencer/operators/audiosync/mfcc/__init__.py | 17 | ||||
-rw-r--r-- | power_sequencer/operators/audiosync/mfcc/mfcc.py | 92 | ||||
-rw-r--r-- | power_sequencer/operators/audiosync/mfcc/segment_axis.py | 110 | ||||
-rw-r--r-- | power_sequencer/operators/audiosync/mfcc/trfbank.py | 51 |
4 files changed, 270 insertions, 0 deletions
diff --git a/power_sequencer/operators/audiosync/mfcc/__init__.py b/power_sequencer/operators/audiosync/mfcc/__init__.py new file mode 100644 index 00000000..2b20af43 --- /dev/null +++ b/power_sequencer/operators/audiosync/mfcc/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright (C) 2016-2019 by Nathan Lovato, Daniel Oakey, Razvan Radulescu, and contributors +# +# This file is part of Power Sequencer. +# +# Power Sequencer is free software: you can redistribute it and/or modify it under the terms of the +# GNU General Public License as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# Power Sequencer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with Power Sequencer. If +# not, see <https://www.gnu.org/licenses/>. +# +from .mfcc import mfcc diff --git a/power_sequencer/operators/audiosync/mfcc/mfcc.py b/power_sequencer/operators/audiosync/mfcc/mfcc.py new file mode 100644 index 00000000..93709e8c --- /dev/null +++ b/power_sequencer/operators/audiosync/mfcc/mfcc.py @@ -0,0 +1,92 @@ +# +# Copyright (C) 2016-2019 by Nathan Lovato, Daniel Oakey, Razvan Radulescu, and contributors +# +# This file is part of Power Sequencer. +# +# Power Sequencer is free software: you can redistribute it and/or modify it under the terms of the +# GNU General Public License as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# Power Sequencer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with Power Sequencer. If +# not, see <https://www.gnu.org/licenses/>. +# +import numpy as np + +from scipy.signal import hamming, lfilter +from scipy.fftpack import fft +from scipy.fftpack.realtransforms import dct + +from .trfbank import trfbank +from .segment_axis import segment_axis + + +def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13): + """Compute Mel Frequency Cepstral Coefficients. + + Parameters + ---------- + input: ndarray + input from which the coefficients are computed + + Returns + ------- + ceps: ndarray + Mel-cepstrum coefficients + mspec: ndarray + Log-spectrum in the mel-domain. + + Notes + ----- + MFCC are computed as follows: + * Pre-processing in time-domain (pre-emphasizing) + * Compute the spectrum amplitude by windowing with a Hamming window + * Filter the signal in the spectral domain with a triangular + filter-bank, whose filters are approximatively linearly spaced on the + mel scale, and have equal bandwidth in the mel scale + * Compute the DCT of the log-spectrum + + References + ---------- + .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric + representations for monosyllabic word recognition in continuously + spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc. + ASSP-28 (4): 357-366, August 1980.""" + + # MFCC parameters: taken from auditory toolbox + over = nwin - 160 + # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the + # radiation at the lips level) + prefac = 0.97 + + # lowfreq = 400 / 3. + lowfreq = 133.33 + # highfreq = 6855.4976 + linsc = 200 / 3.0 + logsc = 1.0711703 + + nlinfil = 13 + nlogfil = 27 + nfil = nlinfil + nlogfil + + w = hamming(nwin, sym=0) + + fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0] + + # ------------------ + # Compute the MFCC + # ------------------ + extract = lfilter([1.0, -prefac], 1, input) + framed = segment_axis(extract, nwin, over) * w + + # Compute the spectrum magnitude + spec = np.abs(fft(framed, nfft, axis=-1)) + # Filter the spectrum through the triangle filterbank + mspec = np.log10(np.dot(spec, fbank.T)) + # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain) + ceps = dct(mspec, type=2, norm="ortho", axis=-1)[:, :nceps] + + return ceps, mspec, spec diff --git a/power_sequencer/operators/audiosync/mfcc/segment_axis.py b/power_sequencer/operators/audiosync/mfcc/segment_axis.py new file mode 100644 index 00000000..a8345833 --- /dev/null +++ b/power_sequencer/operators/audiosync/mfcc/segment_axis.py @@ -0,0 +1,110 @@ +# +# Copyright (C) 2016-2019 by Nathan Lovato, Daniel Oakey, Razvan Radulescu, and contributors +# +# This file is part of Power Sequencer. +# +# Power Sequencer is free software: you can redistribute it and/or modify it under the terms of the +# GNU General Public License as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# Power Sequencer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with Power Sequencer. If +# not, see <https://www.gnu.org/licenses/>. +# +import numpy as np +import warnings + + +def segment_axis(a, length, overlap=0, axis=None, end="cut", endvalue=0): + """Generate a new array that chops the given array along the given axis + into overlapping frames. + + example: + >>> segment_axis(arange(10), 4, 2) + array([[0, 1, 2, 3], + [2, 3, 4, 5], + [4, 5, 6, 7], + [6, 7, 8, 9]]) + + arguments: + a The array to segment + length The length of each frame + overlap The number of array elements by which the frames should overlap + axis The axis to operate on; if None, act on the flattened array + end What to do with the last frame, if the array is not evenly + divisible into pieces. Options are: + + 'cut' Simply discard the extra values + 'wrap' Copy values from the beginning of the array + 'pad' Pad with a constant value + + endvalue The value to use for end='pad' + + The array is not copied unless necessary (either because it is unevenly + strided and being flattened or because end is set to 'pad' or 'wrap'). + """ + + if axis is None: + a = np.ravel(a) # may copy + axis = 0 + + l = a.shape[axis] + + if overlap >= length: + raise ValueError("frames cannot overlap by more than 100%") + if overlap < 0 or length <= 0: + raise ValueError("overlap must be nonnegative and length must " "be positive") + + if l < length or (l - length) % (length - overlap): + if l > length: + roundup = length + (1 + (l - length) // (length - overlap)) * (length - overlap) + rounddown = length + ((l - length) // (length - overlap)) * (length - overlap) + else: + roundup = length + rounddown = 0 + assert rounddown < l < roundup + assert roundup == rounddown + (length - overlap) or (roundup == length and rounddown == 0) + a = a.swapaxes(-1, axis) + + if end == "cut": + a = a[..., :rounddown] + elif end in ["pad", "wrap"]: # copying will be necessary + s = list(a.shape) + s[-1] = roundup + b = np.empty(s, dtype=a.dtype) + b[..., :l] = a + if end == "pad": + b[..., l:] = endvalue + elif end == "wrap": + b[..., l:] = a[..., : roundup - l] + a = b + + a = a.swapaxes(-1, axis) + + l = a.shape[axis] + if l == 0: + raise ValueError( + "Not enough data points to segment array in 'cut' mode; " "try 'pad' or 'wrap'" + ) + assert l >= length + assert (l - length) % (length - overlap) == 0 + n = 1 + (l - length) // (length - overlap) + s = a.strides[axis] + newshape = a.shape[:axis] + (n, length) + a.shape[axis + 1 :] + newstrides = a.strides[:axis] + ((length - overlap) * s, s) + a.strides[axis + 1 :] + + try: + return np.ndarray.__new__( + np.ndarray, strides=newstrides, shape=newshape, buffer=a, dtype=a.dtype + ) + except TypeError: + warnings.warn("Problem with ndarray creation forces copy.") + a = a.copy() + # Shape doesn't change but strides does + newstrides = a.strides[:axis] + ((length - overlap) * s, s) + a.strides[axis + 1 :] + return np.ndarray.__new__( + np.ndarray, strides=newstrides, shape=newshape, buffer=a, dtype=a.dtype + ) diff --git a/power_sequencer/operators/audiosync/mfcc/trfbank.py b/power_sequencer/operators/audiosync/mfcc/trfbank.py new file mode 100644 index 00000000..00558944 --- /dev/null +++ b/power_sequencer/operators/audiosync/mfcc/trfbank.py @@ -0,0 +1,51 @@ +# +# Copyright (C) 2016-2019 by Nathan Lovato, Daniel Oakey, Razvan Radulescu, and contributors +# +# This file is part of Power Sequencer. +# +# Power Sequencer is free software: you can redistribute it and/or modify it under the terms of the +# GNU General Public License as published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# Power Sequencer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; +# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with Power Sequencer. If +# not, see <https://www.gnu.org/licenses/>. +# +import numpy as np + + +def trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfilt, nlogfilt): + """Compute triangular filterbank for MFCC computation.""" + # Total number of filters + nfilt = nlinfilt + nlogfilt + + # ------------------------ + # Compute the filter bank + # ------------------------ + # Compute start/middle/end points of the triangular filters in spectral + # domain + freqs = np.zeros(nfilt + 2) + freqs[:nlinfilt] = lowfreq + np.arange(nlinfilt) * linsc + freqs[nlinfilt:] = freqs[nlinfilt - 1] * logsc ** np.arange(1, nlogfilt + 3) + heights = 2.0 / (freqs[2:] - freqs[0:-2]) + + # Compute filterbank coeff (in fft domain, in bins) + fbank = np.zeros((nfilt, nfft)) + # FFT bins (in Hz) + nfreqs = np.arange(nfft) / (1.0 * nfft) * fs + for i in range(nfilt): + low = freqs[i] + cen = freqs[i + 1] + hi = freqs[i + 2] + + lid = np.arange(np.floor(low * nfft / fs) + 1, np.floor(cen * nfft / fs) + 1, dtype=np.int) + lslope = heights[i] / (cen - low) + rid = np.arange(np.floor(cen * nfft / fs) + 1, np.floor(hi * nfft / fs) + 1, dtype=np.int) + rslope = heights[i] / (hi - cen) + fbank[i][lid] = lslope * (nfreqs[lid] - low) + fbank[i][rid] = rslope * (hi - nfreqs[rid]) + + return fbank, freqs |