gitlab.xiph.org/xiph/opus.git
author     Krishna Subramani <subramani.krishna97@gmail.com>  2023-09-25 07:19:41 +0300
committer  Jean-Marc Valin <jmvalin@amazon.com>  2023-09-26 19:12:47 +0300
commit     f38b4a317f2c5f1fd2d40668fe7be48099359d66 (patch)
tree       6f129a07ff20e7505b726e3c44413f77f867427a
parent     d88dd893584d5a97fe9350b323394bdc549f4367 (diff)
Python code for neural pitch
-rw-r--r--  dnn/torch/neural-pitch/README.md                       18
-rw-r--r--  dnn/torch/neural-pitch/data_augmentation.py           149
-rw-r--r--  dnn/torch/neural-pitch/download_demand.sh              43
-rw-r--r--  dnn/torch/neural-pitch/evaluation.py                  464
-rw-r--r--  dnn/torch/neural-pitch/experiments.py                  38
-rw-r--r--  dnn/torch/neural-pitch/export_neuralpitch_weights.py   89
-rw-r--r--  dnn/torch/neural-pitch/models.py                      218
-rw-r--r--  dnn/torch/neural-pitch/neural_pitch_update.py         207
-rw-r--r--  dnn/torch/neural-pitch/ptdb_process.sh                 34
-rw-r--r--  dnn/torch/neural-pitch/training.py                    162
-rw-r--r--  dnn/torch/neural-pitch/utils.py                        59
11 files changed, 1481 insertions, 0 deletions
diff --git a/dnn/torch/neural-pitch/README.md b/dnn/torch/neural-pitch/README.md
new file mode 100644
index 00000000..6323ead5
--- /dev/null
+++ b/dnn/torch/neural-pitch/README.md
@@ -0,0 +1,18 @@
+## Neural Pitch Estimation
+
+- Dataset Installation
+ 1. Download and unzip PTDB Dataset:
+ wget https://www2.spsc.tugraz.at/databases/PTDB-TUG/SPEECH_DATA_ZIPPED.zip
+ unzip SPEECH_DATA_ZIPPED.zip
+
+ 2. Inside the extracted "SPEECH DATA" directory, run ptdb_process.sh to combine the male/female audio and pitch references
+
+ 3. To download and combine the DEMAND noise dataset, run download_demand.sh
+
+- LPCNet preparation
+ 1. To extract the cross-correlation (xcorr) features, add lpcnet_extractor.c, add the relevant functions to lpcnet_enc.c, register the new source/header files in Makefile.am, and compile to generate the ./lpcnet_xcorr_extractor object
+
+- Dataset Augmentation and training (see the arguments to each of the following scripts; an example invocation is shown after this list)
+ 1. Run data_augmentation.py
+ 2. Run training.py using augmented data
+ 3. Run experiments.py
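+
+- Example end-to-end invocation (illustrative sketch only; the file/directory names and the checkpoint name below are placeholders, see each script's --help for the full argument list)
+
+       # 1. Augment raw 16 kHz speech with DEMAND noise, dump IF/xcorr features and CREPE pitch targets
+       python data_augmentation.py speech_16k.raw aug/out ./lpcnet_xcorr_extractor ./combined_demand_channels/ --flag_xcorr True
+
+       # 2. Train the joint IF + xcorr model on the augmented features
+       python training.py aug/out_iffeat.f32 aug/out_xcorr.f32 aug/out_pitches.npy checkpoints/ both
+
+       # 3. Evaluate RPA vs. SNR on PTDB for the trained checkpoint
+       python experiments.py "./SPEECH DATA/" results.json model --pth_file "checkpoints/<timestamp>_net_both.pth"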
diff --git a/dnn/torch/neural-pitch/data_augmentation.py b/dnn/torch/neural-pitch/data_augmentation.py
new file mode 100644
index 00000000..ee7a3cab
--- /dev/null
+++ b/dnn/torch/neural-pitch/data_augmentation.py
@@ -0,0 +1,149 @@
+"""
+Perform Data Augmentation (Gain, Additive Noise, Random Filtering) on Input TTS Data
+1. Read in chunks and compute clean pitch first
+2. Then add in augmentation (Noise/Level/Response)
+ - Adds filtered noise from the "Demand" dataset, https://zenodo.org/record/1227121#.XRKKxYhKiUk
+ - When using the Demand Dataset, consider each channel as a possible noise input, and keep the first 4 minutes of noise for training
+3. Use this "augmented" audio for feature computation, and compute pitch using CREPE on the clean input
+
+Notes: To ensure consistency with the discovered CREPE offset, we do the following
+- We pad the input audio to the zero-centered CREPE estimator with 80 zeros
+- We pad the input audio to our feature computation with 160 zeros to center them
+"""
+
+import argparse
+parser = argparse.ArgumentParser()
+
+parser.add_argument('data', type=str, help='input raw audio data')
+parser.add_argument('output', type=str, help='output directory')
+parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
+parser.add_argument('noise_dataset', type=str, help='Location of the Demand Dataset')
+parser.add_argument('--flag_xcorr', type=bool, help='Flag to additionally dump xcorr features',choices=[True,False],default = False,required = False)
+parser.add_argument('--fraction_input_use', type=float, help='Fraction of input data to consider',default = 0.3,required = False)
+parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
+parser.add_argument('--choice_augment', type=str, help='Choice of noise augmentation, either use additive synthetic noise or add noise from the demand dataset',choices = ['demand','synthetic'],default = "demand",required = False)
+parser.add_argument('--fraction_clean', type=float, help='Fraction of data to keep clean (i.e. not augmented with anything)',default = 0.2,required = False)
+parser.add_argument('--chunk_size', type=int, help='Number of samples to augment with for each iteration',default = 80000,required = False)
+parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
+parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
+parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
+
+args = parser.parse_args()
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
+
+from utils import stft, random_filter
+
+import numpy as np
+import tqdm
+import crepe
+import random
+import glob
+import subprocess
+
+data_full = np.memmap(args.data, dtype=np.int16,mode = 'r')
+data = data_full[:(int)(args.fraction_input_use*data_full.shape[0])]
+
+# list_features = []
+list_cents = []
+list_confidences = []
+
+N = args.N
+H = args.H
+freq_keep = args.freq_keep
+# Minimum/Maximum periods, decided by LPCNet
+min_period = 32
+max_period = 256
+f_ref = 16000/max_period
+chunk_size = args.chunk_size
+num_frames_chunk = chunk_size//H
+list_indices_keep = np.concatenate([np.arange(freq_keep), (N//2 + 1) + np.arange(freq_keep), 2*(N//2 + 1) + np.arange(freq_keep)])
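+# With the defaults (N = 320, freq_keep = 30) this keeps bins 0..29 of each of the three feature
+# blocks (log-magnitude, real and imaginary phase difference, each of length N//2 + 1 = 161),
+# giving a 90-dimensional feature per frame.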
+
+output_IF = np.memmap(args.output + '_iffeat.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,list_indices_keep.shape[0]), mode='w+')
+if args.flag_xcorr:
+ output_xcorr = np.memmap(args.output + '_xcorr.f32', dtype=np.float32, shape=(((data.shape[0]//chunk_size - 1)//1)*num_frames_chunk,257), mode='w+')
+
+fraction_clean = args.fraction_clean
+
+noise_dataset = args.noise_dataset
+
+for i in tqdm.trange((data.shape[0]//chunk_size - 1)//1):
+ chunk = data[i*chunk_size:(i + 1)*chunk_size]/(2**15 - 1)
+
+ # Clean Pitch/Confidence Estimate
+ # Padding input to CREPE by 80 samples to ensure it aligns
+ _, pitch, confidence, _ = crepe.predict(np.concatenate([np.zeros(80),chunk]), 16000, center=True, viterbi=True,verbose=0)
+ cent = 1200*np.log2(np.divide(pitch, f_ref, out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)
+
+ # Filter out of range pitches/confidences
+ confidence[pitch < 16000/max_period] = 0
+ confidence[pitch > 16000/min_period] = 0
+
+ # Keep fraction of data clean, augment only 1 minus the fraction
+ if (np.random.rand() > fraction_clean):
+ # Response, generate controlled/random 2nd order IIR filter and filter chunk
+ chunk = random_filter(chunk)
+
+ # Level/Gain response {scale by random gain between 1.0e-3 and 10}
+ # Generate random gain in dB and then convert to scale
+ g_dB = np.random.uniform(low = -60, high = 20, size = 1)
+ # g_dB = 0
+ g = 10**(g_dB/20)
+
+ # Noise Addition {Add random SNR 2nd order randomly colored noise}
+ # Generate noise SNR value and add corresponding noise
+ snr_dB = np.random.uniform(low = -20, high = 30, size = 1)
+
+ if args.choice_augment == 'synthetic':
+ n = np.random.randn(chunk_size)
+ else:
+ list_noisefiles = noise_dataset + '*.wav'
+ noise_file = random.choice(glob.glob(list_noisefiles))
+ n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
+ rand_range = np.random.randint(low = 0, high = (n.shape[0] - 16000*60 - chunk.shape[0])) # 16000 is subtracted because we will use the last 1 minutes of noise for testing
+ n = n[rand_range:rand_range + chunk.shape[0]]
+
+ # Randomly filter the sampled noise as well
+ n = random_filter(n)
+    # Pick a random prime (below ~550) and zero out the last Nprime noise samples, to keep the GRU from picking up temporal patterns in the noise
+ Nprime = random.choice([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541])
+ n[chunk_size - Nprime:] = np.zeros(Nprime)
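+    # Scale the noise so that 10*log10(sum|chunk|^2 / sum|snr_multiplier*n|^2) equals the drawn snr_dB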
+ snr_multiplier = np.sqrt((np.sum(np.abs(chunk)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
+
+ chunk = g*(chunk + snr_multiplier*n)
+
+ # Zero pad input audio by 160 to center the frames
+ spec = stft(x = np.concatenate([np.zeros(160),chunk]), w = 'boxcar', N = N, H = H).T
+ phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+ phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+ feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+ feature = feature[:,list_indices_keep]
+
+ if args.flag_xcorr:
+ # Dump noisy audio into temp file
+ data_temp = np.memmap('./temp_augment.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
+ # data_temp[:chunk.shape[0]] = (chunk/(np.max(np.abs(chunk)))*(2**15 - 1)).astype(np.int16)
+ data_temp[:chunk.shape[0]] = ((chunk)*(2**15 - 1)).astype(np.int16)
+
+ subprocess.run([args.path_lpcnet_extractor, './temp_augment.raw', './temp_augment_xcorr.f32'])
+ feature_xcorr = np.flip(np.fromfile('./temp_augment_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+ ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+ feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+
+ os.remove('./temp_augment.raw')
+ os.remove('./temp_augment_xcorr.f32')
+ num_frames = min(cent.shape[0],feature.shape[0],feature_xcorr.shape[0],num_frames_chunk)
+ feature = feature[:num_frames,:]
+ cent = cent[:num_frames]
+ confidence = confidence[:num_frames]
+ feature_xcorr = feature_xcorr[:num_frames]
+ output_IF[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature
+ output_xcorr[i*num_frames_chunk:(i + 1)*num_frames_chunk,:] = feature_xcorr
+ list_cents.append(cent)
+ list_confidences.append(confidence)
+
+list_cents = np.hstack(list_cents)
+list_confidences = np.hstack(list_confidences)
+
+np.save(args.output + '_pitches',np.vstack([list_cents,list_confidences]))
diff --git a/dnn/torch/neural-pitch/download_demand.sh b/dnn/torch/neural-pitch/download_demand.sh
new file mode 100644
index 00000000..0cff06af
--- /dev/null
+++ b/dnn/torch/neural-pitch/download_demand.sh
@@ -0,0 +1,43 @@
+wget https://zenodo.org/record/1227121/files/DKITCHEN_16k.zip
+
+wget https://zenodo.org/record/1227121/files/DLIVING_16k.zip
+
+wget https://zenodo.org/record/1227121/files/DWASHING_16k.zip
+
+wget https://zenodo.org/record/1227121/files/NFIELD_16k.zip
+
+wget https://zenodo.org/record/1227121/files/NPARK_16k.zip
+
+wget https://zenodo.org/record/1227121/files/NRIVER_16k.zip
+
+wget https://zenodo.org/record/1227121/files/OHALLWAY_16k.zip
+
+wget https://zenodo.org/record/1227121/files/OMEETING_16k.zip
+
+wget https://zenodo.org/record/1227121/files/OOFFICE_16k.zip
+
+wget https://zenodo.org/record/1227121/files/PCAFETER_16k.zip
+
+wget https://zenodo.org/record/1227121/files/PRESTO_16k.zip
+
+wget https://zenodo.org/record/1227121/files/PSTATION_16k.zip
+
+wget https://zenodo.org/record/1227121/files/TMETRO_16k.zip
+
+wget https://zenodo.org/record/1227121/files/TCAR_16k.zip
+
+wget https://zenodo.org/record/1227121/files/TBUS_16k.zip
+
+wget https://zenodo.org/record/1227121/files/STRAFFIC_16k.zip
+
+wget https://zenodo.org/record/1227121/files/SPSQUARE_16k.zip
+
+unzip '*.zip'
+
+mkdir -p ./combined_demand_channels/
+for file in */*.wav; do
+parentdir="$(dirname "$file")"
+echo "$parentdir"
+fname="$(basename "$file")"
+cp "$file" ./combined_demand_channels/"$parentdir"+"$fname"
+done
diff --git a/dnn/torch/neural-pitch/evaluation.py b/dnn/torch/neural-pitch/evaluation.py
new file mode 100644
index 00000000..0369cafa
--- /dev/null
+++ b/dnn/torch/neural-pitch/evaluation.py
@@ -0,0 +1,464 @@
+"""
+Evaluation script to compute the Raw Pitch Accuracy
+Procedure:
+    - Look at all voiced frames in each file
+    - Count the pitch estimates in those frames that lie within a 50 cent threshold of the reference
+    RPA = (Total number of pitch estimates within threshold summed across all files)/(Total number of voiced frames summed across all files)
+"""
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+from prettytable import PrettyTable
+import numpy as np
+import glob
+import random
+import tqdm
+import torch
+import librosa
+import json
+from utils import stft, random_filter, feature_xform
+import subprocess
+import crepe
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def rca(reference,input,voicing,thresh = 25):
+ idx_voiced = np.where(voicing != 0)[0]
+ acc = np.where(np.abs(reference - input)[idx_voiced] < thresh)[0]
+ return acc.shape[0]
+
+def sweep_rca(reference,input,voicing,thresh = 25,ind_arr = np.arange(-10,10)):
+ l = []
+ for i in ind_arr:
+ l.append(rca(reference,np.roll(input,i),voicing,thresh))
+ l = np.array(l)
+
+ return np.max(l)
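+# rca() returns a raw count of voiced frames whose estimate is within `thresh` cents of the
+# reference (not a rate); sweep_rca() additionally searches over small integer frame offsets
+# and keeps the best such count.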
+
+def rpa(model,device = 'cpu',data_format = 'if'):
+ list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
+ dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
+ # random_shuffle = list(np.random.permutation(len(list_files)))
+ random.shuffle(list_files)
+ list_files = list_files[:1000]
+
+ # C_lp = 0
+ # C_lp_m = 0
+ # C_lp_f = 0
+ # list_rca_model_lp = []
+ # list_rca_male_lp = []
+ # list_rca_female_lp = []
+
+ # C_hp = 0
+ # C_hp_m = 0
+ # C_hp_f = 0
+ # list_rca_model_hp = []
+ # list_rca_male_hp = []
+ # list_rca_female_hp = []
+
+ C_all = 0
+ C_all_m = 0
+ C_all_f = 0
+ list_rca_model_all = []
+ list_rca_male_all = []
+ list_rca_female_all = []
+
+ thresh = 50
+ N = 320
+ H = 160
+ freq_keep = 30
+
+ for idx in tqdm.trange(len(list_files)):
+ audio_file = list_files[idx]
+ file_name = os.path.basename(list_files[idx])[:-4]
+
+ audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
+ offset = 432
+ audio = audio[offset:]
+ rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
+
+ spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
+ phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+ phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+ idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
+ feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+ feature_if = feature[:,idx_save]
+
+ data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
+ data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
+
+ subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
+ feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+ ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+ feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+ # feature_xcorr = feature_xform(feature_xcorr)
+
+ os.remove('./temp.raw')
+ os.remove('./temp_xcorr.f32')
+
+ if data_format == 'if':
+ feature = feature_if
+ elif data_format == 'xcorr':
+ feature = feature_xcorr
+ else:
+ indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
+ feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
+
+
+ pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
+ pitch = np.loadtxt(pitch_file_name)[:,0]
+ voicing = np.loadtxt(pitch_file_name)[:,1]
+ indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
+ pitch = pitch[:indmin]
+ voicing = voicing[:indmin]
+ rmse = rmse[:indmin]
+ voicing = voicing*(rmse > 0.05*np.max(rmse))
+ if "mic_F" in audio_file:
+ idx_correct = np.where(pitch < 125)
+ voicing[idx_correct] = 0
+
+ cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
+
+ # if (model == 'penn'):
+ # model_frequency, _ = penn.from_audio(
+ # torch.from_numpy(audio).unsqueeze(0).float(),
+ # 16000,
+ # hopsize=0.01,
+ # fmin=(16000.0/256),
+ # fmax=500,
+ # checkpoint=penn.DEFAULT_CHECKPOINT,
+ # batch_size=32,
+ # pad=True,
+ # interp_unvoiced_at=0.065,
+ # gpu=0)
+ # model_frequency = model_frequency.cpu().detach().squeeze().numpy()
+ # model_cents = 1200*np.log2(model_frequency/(16000/256))
+
+ # elif (model == 'crepe'):
+ # _, model_frequency, _, _ = crepe.predict(audio, 16000, viterbi=vflag,center=True,verbose=0)
+ # lpcnet_file_name = '/home/ubuntu/Code/Datasets/SPEECH_DATA/lpcnet_f0_16k_residual/' + file_name + '_f0.f32'
+ # period_lpcnet = np.fromfile(lpcnet_file_name, dtype='float32')
+ # model_frequency = 16000/(period_lpcnet + 1.0e-6)
+ # model_cents = 1200*np.log2(model_frequency/(16000/256))
+ # else:
+ model_cents = model(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
+ model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
+ # model_cents = np.roll(model_cents,-1*3)
+
+ num_frames = min(cent.shape[0],model_cents.shape[0])
+ pitch = pitch[:num_frames]
+ cent = cent[:num_frames]
+ voicing = voicing[:num_frames]
+ model_cents = model_cents[:num_frames]
+
+ voicing_all = np.copy(voicing)
+        # Force frames where the reference pitch is < 65 Hz or > 500 Hz to unvoiced so the comparison stays within the model's pitch range
+ force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
+ voicing_all[force_out_of_pitch] = 0
+ C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
+
+ # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
+ list_rca_model_all.append(rca(cent,model_cents,voicing_all,thresh))
+ # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
+
+ if "mic_M" in audio_file:
+ # list_rca_male_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
+ list_rca_male_all.append(rca(cent,model_cents,voicing_all,thresh))
+ C_all_m = C_all_m + np.where(voicing_all != 0)[0].shape[0]
+ else:
+ # list_rca_female_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
+ list_rca_female_all.append(rca(cent,model_cents,voicing_all,thresh))
+ C_all_f = C_all_f + np.where(voicing_all != 0)[0].shape[0]
+
+ """
+ # Low pitch estimation
+ voicing_lp = np.copy(voicing)
+ force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 125)==True)
+ voicing_lp[force_out_of_pitch] = 0
+ C_lp = C_lp + np.where(voicing_lp != 0)[0].shape[0]
+
+ # list_rca_model_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
+ list_rca_model_lp.append(rca(cent,model_cents,voicing_lp,thresh))
+
+ if "mic_M" in audio_file:
+ # list_rca_male_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
+ list_rca_male_lp.append(rca(cent,model_cents,voicing_lp,thresh))
+ C_lp_m = C_lp_m + np.where(voicing_lp != 0)[0].shape[0]
+ else:
+ # list_rca_female_lp.append(sweep_rca(cent,model_cents,voicing_lp,thresh,[0]))
+ list_rca_female_lp.append(rca(cent,model_cents,voicing_lp,thresh))
+ C_lp_f = C_lp_f + np.where(voicing_lp != 0)[0].shape[0]
+
+ # High pitch estimation
+ voicing_hp = np.copy(voicing)
+ force_out_of_pitch = np.where(np.logical_or(pitch < 125,pitch > 500)==True)
+ voicing_hp[force_out_of_pitch] = 0
+ C_hp = C_hp + np.where(voicing_hp != 0)[0].shape[0]
+
+ # list_rca_model_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
+ list_rca_model_hp.append(rca(cent,model_cents,voicing_hp,thresh))
+
+ if "mic_M" in audio_file:
+ # list_rca_male_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
+ list_rca_male_hp.append(rca(cent,model_cents,voicing_hp,thresh))
+ C_hp_m = C_hp_m + np.where(voicing_hp != 0)[0].shape[0]
+ else:
+ # list_rca_female_hp.append(sweep_rca(cent,model_cents,voicing_hp,thresh,[0]))
+ list_rca_female_hp.append(rca(cent,model_cents,voicing_hp,thresh))
+ C_hp_f = C_hp_f + np.where(voicing_hp != 0)[0].shape[0]
+ # list_rca_model.append(acc_model)
+ # list_rca_crepe.append(acc_crepe)
+ # list_rca_lpcnet.append(acc_lpcnet)
+ # list_rca_penn.append(acc_penn)
+ """
+
+ # list_rca_crepe = np.array(list_rca_crepe)
+ # list_rca_model_lp = np.array(list_rca_model_lp)
+ # list_rca_male_lp = np.array(list_rca_male_lp)
+ # list_rca_female_lp = np.array(list_rca_female_lp)
+
+ # list_rca_model_hp = np.array(list_rca_model_hp)
+ # list_rca_male_hp = np.array(list_rca_male_hp)
+ # list_rca_female_hp = np.array(list_rca_female_hp)
+
+ list_rca_model_all = np.array(list_rca_model_all)
+ list_rca_male_all = np.array(list_rca_male_all)
+ list_rca_female_all = np.array(list_rca_female_all)
+ # list_rca_lpcnet = np.array(list_rca_lpcnet)
+ # list_rca_penn = np.array(list_rca_penn)
+
+ x = PrettyTable()
+
+ x.field_names = ["Experiment", "Mean RPA"]
+ x.add_row(["Both all pitches", np.sum(list_rca_model_all)/C_all])
+ # x.add_row(["Both low pitches", np.sum(list_rca_model_lp)/C_lp])
+ # x.add_row(["Both high pitches", np.sum(list_rca_model_hp)/C_hp])
+
+ x.add_row(["Male all pitches", np.sum(list_rca_male_all)/C_all_m])
+ # x.add_row(["Male low pitches", np.sum(list_rca_male_lp)/C_lp_m])
+ # x.add_row(["Male high pitches", np.sum(list_rca_male_hp)/C_hp_m])
+
+ x.add_row(["Female all pitches", np.sum(list_rca_female_all)/C_all_f])
+ # x.add_row(["Female low pitches", np.sum(list_rca_female_lp)/C_lp_f])
+ # x.add_row(["Female high pitches", np.sum(list_rca_female_hp)/C_hp_f])
+
+ print(x)
+
+ return None
+
+def cycle_eval(list_files_pth, noise_type = 'synthetic', noise_dataset = None, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = None,fraction = 0.1,thresh = 50):
+ """
+ Cycle through SNR evaluation for list of .pth files
+ """
+ # list_files = glob.glob('/home/ubuntu/Code/Datasets/SPEECH DATA/combined_mic_16k_raw/*.raw')
+ # dir_f0 = '/home/ubuntu/Code/Datasets/SPEECH DATA/combine_f0_ptdb/'
+ # random_shuffle = list(np.random.permutation(len(list_files)))
+ list_files = glob.glob(ptdb_dataset_path + 'combined_mic_16k/*.raw')
+ dir_f0 = ptdb_dataset_path + 'combined_reference_f0/'
+ random.shuffle(list_files)
+ list_files = list_files[:(int)(fraction*len(list_files))]
+
+ # list_nfiles = ['DKITCHEN','NFIELD','OHALLWAY','PCAFETER','SPSQUARE','TCAR','DLIVING','NPARK','OMEETING','PRESTO','STRAFFIC','TMETRO','DWASHING','NRIVER','OOFFICE','PSTATION','TBUS']
+
+ dict_models = {}
+ list_snr.append(np.inf)
+ # thresh = 50
+
+ for f in list_files_pth:
+ if (f!='crepe') and (f!='lpcnet'):
+ fname = os.path.basename(f).split('_')[0] + '_' + os.path.basename(f).split('_')[-1][:-4]
+ config_path = os.path.dirname(f) + '/' + os.path.basename(f).split('_')[0] + '_' + 'config_' + os.path.basename(f).split('_')[-1][:-4] + '.json'
+ with open(config_path) as json_file:
+ dict_params = json.load(json_file)
+
+ if dict_params['data_format'] == 'if':
+ from models import large_if_ccode as model
+ pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device)
+ elif dict_params['data_format'] == 'xcorr':
+ from models import large_xcorr as model
+ pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
+ else:
+ from models import large_joint as model
+ pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
+
+ pitch_nn.load_state_dict(torch.load(f))
+
+ N = dict_params['window_size']
+ H = dict_params['hop_factor']
+ freq_keep = dict_params['freq_keep']
+
+ list_mean = []
+ list_std = []
+ for snr_dB in list_snr:
+ C_all = 0
+ C_correct = 0
+ for idx in tqdm.trange(len(list_files)):
+ audio_file = list_files[idx]
+ file_name = os.path.basename(list_files[idx])[:-4]
+
+ audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
+ offset = 432
+ audio = audio[offset:]
+ rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = N,hop_length = H))
+
+ if noise_type != 'synthetic':
+ list_noisefiles = noise_dataset + '*.wav'
+ noise_file = random.choice(glob.glob(list_noisefiles))
+ n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
+ rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
+ n = n[rand_range:rand_range + audio.shape[0]]
+ else:
+ n = np.random.randn(audio.shape[0])
+ n = random_filter(n)
+
+ snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
+ audio = audio + snr_multiplier*n
+
+ spec = stft(x = np.concatenate([np.zeros(160),audio]), w = 'boxcar', N = N, H = H).T
+ phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+ phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+ idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
+ feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+ feature_if = feature[:,idx_save]
+
+ data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
+ # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
+ data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
+
+ subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32'])
+ feature_xcorr = np.flip(np.fromfile('./temp_xcorr.f32', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+ ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+ feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+
+ os.remove('./temp.raw')
+ os.remove('./temp_xcorr.f32')
+
+ if dict_params['data_format'] == 'if':
+ feature = feature_if
+ elif dict_params['data_format'] == 'xcorr':
+ feature = feature_xcorr
+ else:
+ indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
+ feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
+
+ pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
+ pitch = np.loadtxt(pitch_file_name)[:,0]
+ voicing = np.loadtxt(pitch_file_name)[:,1]
+ indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
+ pitch = pitch[:indmin]
+ voicing = voicing[:indmin]
+ rmse = rmse[:indmin]
+ voicing = voicing*(rmse > 0.05*np.max(rmse))
+ if "mic_F" in audio_file:
+ idx_correct = np.where(pitch < 125)
+ voicing[idx_correct] = 0
+
+ cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
+
+ # if os.path.basename(f) == 'crepe':
+ # elif (model == 'crepe'):
+ # _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
+ # model_cents = 1200*np.log2(model_frequency/(16000/256))
+ # else:
+ # else:
+ model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
+ model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
+ # model_cents = np.roll(model_cents,-1*3)
+
+ num_frames = min(cent.shape[0],model_cents.shape[0])
+ pitch = pitch[:num_frames]
+ cent = cent[:num_frames]
+ voicing = voicing[:num_frames]
+ model_cents = model_cents[:num_frames]
+
+ voicing_all = np.copy(voicing)
+                    # Force frames where the reference pitch is < 65 Hz or > 500 Hz to unvoiced so the comparison stays within the model's pitch range
+ force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
+ voicing_all[force_out_of_pitch] = 0
+ C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
+
+ # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
+ C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
+ # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
+ list_mean.append(C_correct/C_all)
+ else:
+ fname = f
+ list_mean = []
+ list_std = []
+ for snr_dB in list_snr:
+ C_all = 0
+ C_correct = 0
+ for idx in tqdm.trange(len(list_files)):
+ audio_file = list_files[idx]
+ file_name = os.path.basename(list_files[idx])[:-4]
+
+ audio = np.memmap(list_files[idx], dtype=np.int16)/(2**15 - 1)
+ offset = 432
+ audio = audio[offset:]
+ rmse = np.squeeze(librosa.feature.rms(y = audio,frame_length = 320,hop_length = 160))
+
+ if noise_type != 'synthetic':
+ list_noisefiles = noise_dataset + '*.wav'
+ noise_file = random.choice(glob.glob(list_noisefiles))
+ n = np.memmap(noise_file, dtype=np.int16,mode = 'r')/(2**15 - 1)
+ rand_range = np.random.randint(low = 0, high = (16000*60*5 - audio.shape[0])) # Last 1 minute of noise used for testing
+ n = n[rand_range:rand_range + audio.shape[0]]
+ else:
+ n = np.random.randn(audio.shape[0])
+ n = random_filter(n)
+
+ snr_multiplier = np.sqrt((np.sum(np.abs(audio)**2)/np.sum(np.abs(n)**2))*10**(-snr_dB/10))
+ audio = audio + snr_multiplier*n
+
+ if (f == 'crepe'):
+ _, model_frequency, _, _ = crepe.predict(np.concatenate([np.zeros(80),audio]), 16000, viterbi=True,center=True,verbose=0)
+ model_cents = 1200*np.log2(model_frequency/(16000/256) + 1.0e-8)
+ else:
+ data_temp = np.memmap('./temp.raw', dtype=np.int16, shape=(audio.shape[0]), mode='w+')
+ # data_temp[:audio.shape[0]] = (audio/(np.max(np.abs(audio)))*(2**15 - 1)).astype(np.int16)
+ data_temp[:audio.shape[0]] = ((audio)*(2**15 - 1)).astype(np.int16)
+
+ subprocess.run(["../../../lpcnet_xcorr_extractor", './temp.raw', './temp_xcorr.f32', './temp_period.f32'])
+ feature_xcorr = np.fromfile('./temp_period.f32', dtype='float32')
+ model_cents = 1200*np.log2((256/feature_xcorr + 1.0e-8) + 1.0e-8)
+
+ os.remove('./temp.raw')
+ os.remove('./temp_xcorr.f32')
+ os.remove('./temp_period.f32')
+
+
+ pitch_file_name = dir_f0 + "ref" + os.path.basename(list_files[idx])[3:-4] + ".f0"
+ pitch = np.loadtxt(pitch_file_name)[:,0]
+ voicing = np.loadtxt(pitch_file_name)[:,1]
+ indmin = min(voicing.shape[0],rmse.shape[0],pitch.shape[0])
+ pitch = pitch[:indmin]
+ voicing = voicing[:indmin]
+ rmse = rmse[:indmin]
+ voicing = voicing*(rmse > 0.05*np.max(rmse))
+ if "mic_F" in audio_file:
+ idx_correct = np.where(pitch < 125)
+ voicing[idx_correct] = 0
+
+ cent = np.rint(1200*np.log2(np.divide(pitch, (16000/256), out=np.zeros_like(pitch), where=pitch!=0) + 1.0e-8)).astype('int')
+ num_frames = min(cent.shape[0],model_cents.shape[0])
+ pitch = pitch[:num_frames]
+ cent = cent[:num_frames]
+ voicing = voicing[:num_frames]
+ model_cents = model_cents[:num_frames]
+
+ voicing_all = np.copy(voicing)
+                    # Force frames where the reference pitch is < 65 Hz or > 500 Hz to unvoiced so the comparison stays within the model's pitch range
+ force_out_of_pitch = np.where(np.logical_or(pitch < 65,pitch > 500)==True)
+ voicing_all[force_out_of_pitch] = 0
+ C_all = C_all + np.where(voicing_all != 0)[0].shape[0]
+
+ # list_rca_model_all.append(sweep_rca(cent,model_cents,voicing_all,thresh,[0]))
+ C_correct = C_correct + rca(cent,model_cents,voicing_all,thresh)
+ # list_rca_model_all.append(np.count_nonzero(np.where(np.abs(cent - model_cents))))
+ list_mean.append(C_correct/C_all)
+ dict_models[fname] = {}
+ dict_models[fname]['list_SNR'] = list_mean[:-1]
+ dict_models[fname]['inf'] = list_mean[-1]
+
+ return dict_models
diff --git a/dnn/torch/neural-pitch/experiments.py b/dnn/torch/neural-pitch/experiments.py
new file mode 100644
index 00000000..bc8ea7e3
--- /dev/null
+++ b/dnn/torch/neural-pitch/experiments.py
@@ -0,0 +1,38 @@
+"""
+Running the experiments;
+ 1. RCA vs SNR for our models, CREPE, LPCNet
+"""
+
+import argparse
+parser = argparse.ArgumentParser()
+
+parser.add_argument('ptdb_root', type=str, help='Root Directory for PTDB generated by running ptdb_process.sh ')
+parser.add_argument('output', type=str, help='Output dump file name')
+parser.add_argument('method', type=str, help='Pitch estimation method to evaluate',choices=['model','lpcnet','crepe'])
+parser.add_argument('--noise_dataset', type=str, help='Location of the Demand Dataset',default = './',required=False)
+parser.add_argument('--noise_type', type=str, help='Type of additive noise',default = 'synthetic',choices=['synthetic','demand'],required=False)
+parser.add_argument('--pth_file', type=str, help='.pth file to analyze',default = './',required = False)
+parser.add_argument('--fraction_files_analyze', type=float, help='Fraction of PTDB dataset to test on',default = 1,required = False)
+parser.add_argument('--threshold_rca', type=float, help='Cent threshold when computing RCA',default = 50,required = False)
+parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
+
+args = parser.parse_args()
+
+import os
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
+
+import json
+from evaluation import cycle_eval
+
+if args.method == 'model':
+ dict_store = cycle_eval([args.pth_file], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
+else:
+ dict_store = cycle_eval([args.method], noise_type = args.noise_type, noise_dataset = args.noise_dataset, list_snr = [-20,-15,-10,-5,0,5,10,15,20], ptdb_dataset_path = args.ptdb_root,fraction = args.fraction_files_analyze,thresh = args.threshold_rca)
+
+dict_store["method"] = args.method
+if args.method == 'model':
+ dict_store['pth'] = args.pth_file
+
+with open(args.output, 'w') as fp:
+ json.dump(dict_store, fp)
diff --git a/dnn/torch/neural-pitch/export_neuralpitch_weights.py b/dnn/torch/neural-pitch/export_neuralpitch_weights.py
new file mode 100644
index 00000000..be374281
--- /dev/null
+++ b/dnn/torch/neural-pitch/export_neuralpitch_weights.py
@@ -0,0 +1,89 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import argparse
+import sys
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '../weight-exchange'))
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('checkpoint', type=str, help='neural pitch model checkpoint')
+parser.add_argument('output_dir', type=str, help='output folder')
+
+args = parser.parse_args()
+
+import torch
+import numpy as np
+
+from models import large_if_ccode
+from wexchange.torch import dump_torch_weights
+from wexchange.c_export import CWriter, print_vector
+
+def c_export(args, model):
+
+ message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"
+
+ enc_writer = CWriter(os.path.join(args.output_dir, "neural_pitch_data"), message=message, model_struct_name='nnpitch')
+ enc_writer.header.write(
+f"""
+#include "opus_types.h"
+"""
+ )
+
+
+ # encoder
+ encoder_dense_layers = [
+ ('initial' , 'initial', 'TANH'),
+ ('upsample' , 'upsample', 'TANH')
+ ]
+
+ for name, export_name, _ in encoder_dense_layers:
+ layer = model.get_submodule(name)
+ dump_torch_weights(enc_writer, layer, name=export_name, verbose=True)
+
+
+ encoder_gru_layers = [
+ ('gru' , 'gru', 'TANH'),
+ ]
+
+ enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, verbose=True, input_sparse=False, quantize=False)
+ for name, export_name, _ in encoder_gru_layers])
+
+ del enc_writer
+
+
+if __name__ == "__main__":
+
+ os.makedirs(args.output_dir, exist_ok=True)
+ model = large_if_ccode()
+ model.load_state_dict(torch.load(args.checkpoint,map_location='cpu'))
+ c_export(args, model)
diff --git a/dnn/torch/neural-pitch/models.py b/dnn/torch/neural-pitch/models.py
new file mode 100644
index 00000000..426f53ce
--- /dev/null
+++ b/dnn/torch/neural-pitch/models.py
@@ -0,0 +1,218 @@
+"""
+Pitch Estimation Models and dataloaders
+ - Classification Based (Input features, output logits)
+"""
+
+import torch
+import numpy as np
+
+class large_if_ccode(torch.nn.Module):
+
+ def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192):
+ super(large_if_ccode,self).__init__()
+
+ self.activation = torch.nn.Tanh()
+ self.initial = torch.nn.Linear(input_dim,gru_dim)
+ self.hidden = torch.nn.Linear(gru_dim,gru_dim)
+ self.gru = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,batch_first = True)
+ self.upsample = torch.nn.Linear(gru_dim,output_dim)
+
+ def forward(self, x):
+
+ x = self.initial(x)
+ x = self.activation(x)
+ x = self.hidden(x)
+ x = self.activation(x)
+ x,_ = self.gru(x)
+ x = self.upsample(x)
+ x = self.activation(x)
+ x = x.permute(0,2,1)
+
+ return x
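+
+# large_if_ccode maps a batch of IF feature sequences of shape (batch, frames, 90) to pitch-class
+# logits of shape (batch, 192, frames), i.e. 192 classes at 20-cent resolution per frame
+# (with the default constructor arguments).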
+
+class large_xcorr(torch.nn.Module):
+
+ def __init__(self,input_dim = 90,gru_dim = 64,output_dim = 192):
+ super(large_xcorr,self).__init__()
+
+ self.activation = torch.nn.Tanh()
+
+ self.conv = torch.nn.Sequential(
+ torch.nn.ZeroPad2d((2,0,1,1)),
+ torch.nn.Conv2d(1, 8, 3, bias = True),
+ self.activation,
+ torch.nn.ZeroPad2d((2,0,1,1)),
+ torch.nn.Conv2d(8, 8, 3, bias = True),
+ self.activation,
+ torch.nn.ZeroPad2d((2,0,1,1)),
+ torch.nn.Conv2d(8, 1, 3, bias = True),
+ self.activation,
+ )
+
+ # self.conv = torch.nn.Sequential(
+ # torch.nn.ConstantPad1d((2,0),0),
+ # torch.nn.Conv1d(64,10,3),
+ # self.activation,
+ # torch.nn.ConstantPad1d((2,0),0),
+ # torch.nn.Conv1d(10,64,3),
+ # self.activation,
+ # )
+
+ self.downsample = torch.nn.Sequential(
+ torch.nn.Linear(input_dim,gru_dim),
+ self.activation
+ )
+ self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True)
+ self.upsample = torch.nn.Sequential(
+ torch.nn.Linear(gru_dim,output_dim),
+ self.activation
+ )
+
+ def forward(self, x):
+ # x = x[:,:,:257].unsqueeze(-1)
+ x = self.conv(x.unsqueeze(-1).permute(0,3,2,1)).squeeze(1)
+ # print(x.shape)
+ # x = self.conv(x.permute(0,3,2,1)).squeeze(1)
+ x,_ = self.GRU(self.downsample(x.permute(0,2,1)))
+ x = self.upsample(x).permute(0,2,1)
+
+ # x = self.downsample(x)
+ # x = self.activation(x)
+ # x = self.conv(x.permute(0,2,1)).permute(0,2,1)
+ # x,_ = self.GRU(x)
+ # x = self.upsample(x).permute(0,2,1)
+ return x
+
+class large_joint(torch.nn.Module):
+ """
+ Joint IF-xcorr
+    2D CNN on the xcorr block, dense embedding of the IF block, concatenate and downsample, then GRU
+ """
+
+ def __init__(self,input_IF_dim = 90,input_xcorr_dim = 257,gru_dim = 64,output_dim = 192):
+ super(large_joint,self).__init__()
+
+ self.activation = torch.nn.Tanh()
+
+ self.if_upsample = torch.nn.Sequential(
+ torch.nn.Linear(input_IF_dim,64),
+ self.activation,
+ torch.nn.Linear(64,64),
+ self.activation,
+ )
+
+ # self.if_upsample = torch.nn.Sequential(
+ # torch.nn.ConstantPad1d((2,0),0),
+ # torch.nn.Conv1d(90,10,3),
+ # self.activation,
+ # torch.nn.ConstantPad1d((2,0),0),
+ # torch.nn.Conv1d(10,257,3),
+ # self.activation,
+ # )
+
+ self.conv = torch.nn.Sequential(
+ torch.nn.ZeroPad2d((2,0,1,1)),
+ torch.nn.Conv2d(1, 8, 3, bias = True),
+ self.activation,
+ torch.nn.ZeroPad2d((2,0,1,1)),
+ torch.nn.Conv2d(8, 8, 3, bias = True),
+ self.activation,
+ torch.nn.ZeroPad2d((2,0,1,1)),
+ torch.nn.Conv2d(8, 1, 3, bias = True),
+ self.activation,
+ )
+
+ # self.conv = torch.nn.Sequential(
+ # torch.nn.ConstantPad1d((2,0),0),
+ # torch.nn.Conv1d(257,10,3),
+ # self.activation,
+ # torch.nn.ConstantPad1d((2,0),0),
+ # torch.nn.Conv1d(10,64,3),
+ # self.activation,
+ # )
+
+ self.downsample = torch.nn.Sequential(
+ torch.nn.Linear(64 + input_xcorr_dim,gru_dim),
+ self.activation
+ )
+ self.GRU = torch.nn.GRU(input_size = gru_dim,hidden_size = gru_dim,num_layers = 1,batch_first = True)
+ self.upsample = torch.nn.Sequential(
+ torch.nn.Linear(gru_dim,output_dim),
+ self.activation
+ )
+
+ def forward(self, x):
+ xcorr_feat = x[:,:,:257]
+ if_feat = x[:,:,257:]
+ # x = torch.cat([xcorr_feat.unsqueeze(-1),self.if_upsample(if_feat).unsqueeze(-1)],axis = -1)
+ xcorr_feat = self.conv(xcorr_feat.unsqueeze(-1).permute(0,3,2,1)).squeeze(1).permute(0,2,1)
+ if_feat = self.if_upsample(if_feat)
+ x = torch.cat([xcorr_feat,if_feat],axis = - 1)
+ # x = self.conv(x.permute(0,3,2,1)).squeeze(1)
+ x,_ = self.GRU(self.downsample(x))
+ x = self.upsample(x).permute(0,2,1)
+
+ return x
+
+
+# Dataloaders
+class loader(torch.utils.data.Dataset):
+ def __init__(self, features_if, file_pitch,confidence_threshold = 0.4,dimension_if = 30,context = 100):
+ self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,3*dimension_if)
+
+ # Resolution of 20 cents
+ self.cents = np.rint(np.load(file_pitch)[0,:]/20)
+ self.cents = np.clip(self.cents,0,179)
+ self.confidence = np.load(file_pitch)[1,:]
+
+ # Filter confidence for CREPE
+ self.confidence[self.confidence < confidence_threshold] = 0
+ self.context = context
+ # Clip both to same size
+ size_common = min(self.if_feat.shape[0],self.cents.shape[0])
+ self.if_feat = self.if_feat[:size_common,:]
+ self.cents = self.cents[:size_common]
+ self.confidence = self.confidence[:size_common]
+
+ frame_max = self.if_feat.shape[0]//context
+ self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,3*dimension_if))
+ self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context))
+ self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context))
+
+ def __len__(self):
+ return self.if_feat.shape[0]
+
+ def __getitem__(self, index):
+ return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
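+
+# Each item from `loader` is a (context, 90) block of IF features together with the per-frame
+# 20-cent pitch class targets and the CREPE confidences used to weight the training loss.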
+
+class loader_joint(torch.utils.data.Dataset):
+ def __init__(self, features_if, file_pitch, features_xcorr,confidence_threshold = 0.4,context = 100, choice_data = 'both'):
+ self.if_feat = np.memmap(features_if, dtype=np.float32).reshape(-1,90)
+ self.xcorr = np.memmap(features_xcorr, dtype=np.float32).reshape(-1,257)
+ self.cents = np.rint(np.load(file_pitch)[0,:]/20)
+ self.cents = np.clip(self.cents,0,179)
+ self.confidence = np.load(file_pitch)[1,:]
+ # Filter confidence for CREPE
+ self.confidence[self.confidence < confidence_threshold] = 0
+ self.context = context
+
+ self.choice_data = choice_data
+
+ frame_max = self.if_feat.shape[0]//context
+ self.if_feat = np.reshape(self.if_feat[:frame_max*context,:],(frame_max,context,90))
+ self.cents = np.reshape(self.cents[:frame_max*context],(frame_max,context))
+ self.xcorr = np.reshape(self.xcorr[:frame_max*context,:],(frame_max,context,257))
+ # self.cents = np.rint(60*np.log2(256/(self.periods + 1.0e-8))).astype('int')
+ # self.cents = np.clip(self.cents,0,239)
+ self.confidence = np.reshape(self.confidence[:frame_max*context],(frame_max,context))
+ # print(self.if_feat.shape)
+ def __len__(self):
+ return self.if_feat.shape[0]
+
+ def __getitem__(self, index):
+ if self.choice_data == 'both':
+ return torch.cat([torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.if_feat[index,:,:])],dim=-1),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
+ elif self.choice_data == 'if':
+ return torch.from_numpy(self.if_feat[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
+ else:
+ return torch.from_numpy(self.xcorr[index,:,:]),torch.from_numpy(self.cents[index]),torch.from_numpy(self.confidence[index])
diff --git a/dnn/torch/neural-pitch/neural_pitch_update.py b/dnn/torch/neural-pitch/neural_pitch_update.py
new file mode 100644
index 00000000..5d8074cf
--- /dev/null
+++ b/dnn/torch/neural-pitch/neural_pitch_update.py
@@ -0,0 +1,207 @@
+import argparse
+parser = argparse.ArgumentParser()
+
+parser.add_argument('features', type=str, help='Features generated from dump_data')
+parser.add_argument('data', type=str, help='Data generated from dump_data (offset by 5ms)')
+parser.add_argument('output', type=str, help='output .f32 feature file with replaced neural pitch')
+parser.add_argument('pth_file', type=str, help='.pth file to use for pitch')
+parser.add_argument('path_lpcnet_extractor', type=str, help='path to LPCNet extractor object file (generated on compilation)')
+parser.add_argument('--device', type=str, help='compute device',default = None,required = False)
+parser.add_argument('--replace_xcorr', type = bool, default = False, help='Replace LPCNet xcorr with updated one')
+
+args = parser.parse_args()
+
+import os
+
+from utils import stft, random_filter
+import subprocess
+import numpy as np
+import json
+import torch
+import tqdm
+
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+if args.device is not None:
+ device = torch.device(args.device)
+
+# Loading the appropriate model
+config_path = os.path.dirname(args.pth_file) + '/' + os.path.basename(args.pth_file).split('_')[0] + '_' + 'config_' + os.path.basename(args.pth_file).split('_')[-1][:-4] + '.json'
+with open(config_path) as json_file:
+ dict_params = json.load(json_file)
+
+if dict_params['data_format'] == 'if':
+ from models import large_if_ccode as model
+ pitch_nn = model(dict_params['freq_keep']*3,dict_params['gru_dim'],dict_params['output_dim']).to(device)
+elif dict_params['data_format'] == 'xcorr':
+ from models import large_xcorr as model
+ pitch_nn = model(dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
+else:
+ from models import large_joint as model
+ pitch_nn = model(dict_params['freq_keep']*3,dict_params['xcorr_dim'],dict_params['gru_dim'],dict_params['output_dim']).to(device)
+
+pitch_nn.load_state_dict(torch.load(args.pth_file))
+pitch_nn = pitch_nn.to(device)
+
+N = dict_params['window_size']
+H = dict_params['hop_factor']
+freq_keep = dict_params['freq_keep']
+
+# import os
+# import argparse
+
+
+
+# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+os.environ["OMP_NUM_THREADS"] = "16"
+
+# parser = argparse.ArgumentParser()
+
+# parser.add_argument('features', type=str, help='input features')
+# parser.add_argument('data', type=str, help='input data')
+# parser.add_argument('output', type=str, help='output features')
+# parser.add_argument('--add-confidence', action='store_true', help='add CREPE confidence to features')
+# parser.add_argument('--viterbi', action='store_true', help='enable viterbi algo for pitch tracking')
+
+
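+# run_lpc(): frame-wise LPC prediction of `signal` using the per-frame coefficients in `lpcs`;
+# returns the prediction and the residual (prediction error), which the pitch xcorr below is
+# computed on.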
+def run_lpc(signal, lpcs, frame_length=160):
+ num_frames, lpc_order = lpcs.shape
+
+ prediction = np.concatenate(
+ [- np.convolve(signal[i * frame_length : (i + 1) * frame_length + lpc_order - 1], lpcs[i], mode='valid') for i in range(num_frames)]
+ )
+ error = signal[lpc_order :] - prediction
+
+ return prediction, error
+
+
+if __name__ == "__main__":
+ args = parser.parse_args()
+
+ features = np.memmap(args.features, dtype=np.float32,mode = 'r').reshape((-1, 36))
+ data = np.memmap(args.data, dtype=np.int16,mode = 'r').reshape((-1, 2))
+
+ num_frames = features.shape[0]
+ feature_dim = features.shape[1]
+
+ assert feature_dim == 36
+
+ # if args.add_confidence:
+ # feature_dim += 1
+
+ output = np.memmap(args.output, dtype=np.float32, shape=(num_frames, feature_dim), mode='w+')
+ output[:, :36] = features
+
+ # lpc coefficients and signal
+ lpcs = features[:, 20:36]
+ sig = data[:, 1]
+
+ # parameters
+ # use_viterbi=args.viterbi
+
+ # constants
+ pitch_min = 32
+ pitch_max = 256
+ lpc_order = 16
+ fs = 16000
+ frame_length = 160
+ overlap_frames = 100
+ chunk_size = 10000
+ history_length = frame_length * overlap_frames
+ history = np.zeros(history_length, dtype=np.int16)
+ pitch_position=18
+ xcorr_position=19
+ conf_position=36
+
+ num_frames = len(sig) // 160 - 1
+
+ frame_start = 0
+ frame_stop = min(frame_start + chunk_size, num_frames)
+ signal_start = 0
+ signal_stop = frame_stop * frame_length
+
+ niters = (num_frames - 1)//chunk_size
+ for i in tqdm.trange(niters):
+ if (frame_start > num_frames - 1):
+ break
+ chunk = np.concatenate((history, sig[signal_start:signal_stop]))
+ chunk_la = np.concatenate((history, sig[signal_start:signal_stop + 80]))
+ # time, frequency, confidence, _ = crepe.predict(chunk, fs, center=True, viterbi=True,verbose=0)
+
+ # Feature computation
+ spec = stft(x = np.concatenate([np.zeros(80),chunk_la/(2**15 - 1)]), w = 'boxcar', N = N, H = H).T
+ phase_diff = spec*np.conj(np.roll(spec,1,axis = -1))
+ phase_diff = phase_diff/(np.abs(phase_diff) + 1.0e-8)
+ idx_save = np.concatenate([np.arange(freq_keep),(N//2 + 1) + np.arange(freq_keep),2*(N//2 + 1) + np.arange(freq_keep)])
+ feature = np.concatenate([np.log(np.abs(spec) + 1.0e-8),np.real(phase_diff),np.imag(phase_diff)],axis = 0).T
+ feature_if = feature[:,idx_save]
+
+ data_temp = np.memmap('./temp_featcompute_' + dict_params['data_format'] + '_.raw', dtype=np.int16, shape=(chunk.shape[0]), mode='w+')
+ data_temp[:chunk.shape[0]] = chunk_la[80:].astype(np.int16)
+
+ subprocess.run([args.path_lpcnet_extractor, './temp_featcompute_' + dict_params['data_format'] + '_.raw', './temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw'])
+ feature_xcorr = np.flip(np.fromfile('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw', dtype='float32').reshape((-1,256),order = 'C'),axis = 1)
+ ones_zero_lag = np.expand_dims(np.ones(feature_xcorr.shape[0]),-1)
+ feature_xcorr = np.concatenate([ones_zero_lag,feature_xcorr],axis = -1)
+
+ os.remove('./temp_featcompute_' + dict_params['data_format'] + '_.raw')
+ os.remove('./temp_featcompute_xcorr_' + dict_params['data_format'] + '_.raw')
+
+ if dict_params['data_format'] == 'if':
+ feature = feature_if
+ elif dict_params['data_format'] == 'xcorr':
+ feature = feature_xcorr
+ else:
+ indmin = min(feature_if.shape[0],feature_xcorr.shape[0])
+ feature = np.concatenate([feature_xcorr[:indmin,:],feature_if[:indmin,:]],-1)
+
+ # Compute pitch with my model
+ model_cents = pitch_nn(torch.from_numpy(np.copy(np.expand_dims(feature,0))).float().to(device))
+ model_cents = 20*model_cents.argmax(dim=1).cpu().detach().squeeze().numpy()
+ frequency = 62.5*2**(model_cents/1200)
+
+ frequency = frequency[overlap_frames : overlap_frames + frame_stop - frame_start]
+ # confidence = confidence[overlap_frames : overlap_frames + frame_stop - frame_start]
+
+ # convert frequencies to periods
+ periods = np.round(fs / frequency)
+
+ # adjust to pitch range
+ # confidence[periods < pitch_min] = 0
+ # confidence[periods > pitch_max] = 0
+ periods = np.clip(periods, pitch_min, pitch_max)
+
+ output[frame_start:frame_stop, pitch_position] = (periods - 100) / 50
+
+ # if args.replace_xcorr:
+ # re-calculate xcorr
+ frame_offset = (pitch_max + frame_length - 1) // frame_length
+ offset = frame_offset * frame_length
+ padding = lpc_order
+
+
+ if frame_start < frame_offset:
+ lpc_coeffs = np.concatenate((np.zeros((frame_offset - frame_start, lpc_order), dtype=np.float32), lpcs[:frame_stop]))
+ else:
+ lpc_coeffs = lpcs[frame_start - frame_offset : frame_stop]
+
+ pred, error = run_lpc(chunk[history_length - offset - padding :], lpc_coeffs, frame_length=frame_length)
+
+ xcorr = np.zeros(frame_stop - frame_start)
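+    # Normalized cross-correlation between each residual frame and the residual one pitch
+    # period (p samples) earlier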
+ for i, p in enumerate(periods.astype(np.int16)):
+ if p > 0:
+ f1 = error[offset + i * frame_length : offset + (i + 1) * frame_length]
+ f2 = error[offset + i * frame_length - p : offset + (i + 1) * frame_length - p]
+ xcorr[i] = np.dot(f1, f2) / np.sqrt(np.dot(f1, f1) * np.dot(f2, f2) + 1e-6)
+
+ output[frame_start:frame_stop, xcorr_position] = xcorr - 0.5
+
+ # update buffers and indices
+ history = chunk[-history_length :]
+
+ frame_start += chunk_size
+ frame_stop += chunk_size
+ frame_stop = min(frame_stop, num_frames)
+
+ signal_start = frame_start * frame_length
+ signal_stop = frame_stop * frame_length
diff --git a/dnn/torch/neural-pitch/ptdb_process.sh b/dnn/torch/neural-pitch/ptdb_process.sh
new file mode 100644
index 00000000..f4df5465
--- /dev/null
+++ b/dnn/torch/neural-pitch/ptdb_process.sh
@@ -0,0 +1,34 @@
+# Copy into PTDB root directory and run to combine all the male/female raw audio/references into below directories
+
+# Make folder for combined audio
+mkdir -p './combined_mic_16k/'
+# Make folder for combined pitch reference
+mkdir -p './combined_reference_f0/'
+
+# Resample Male Audio
+for i in ./MALE/MIC/**/*.wav; do
+j="$(basename "$i" .wav)"
+echo $j
+sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw
+done
+
+# Resample Female Audio
+for i in ./FEMALE/MIC/**/*.wav; do
+j="$(basename "$i" .wav)"
+echo $j
+sox -r 48000 -b 16 -e signed-integer "$i" -r 16000 -b 16 -e signed-integer ./combined_mic_16k/$j.raw
+done
+
+# Copy male reference pitch files
+for i in ./MALE/REF/**/*.f0; do
+j="$(basename "$i" .f0)"
+echo $j
+cp "$i" ./combined_reference_f0/
+done
+
+# Copy female reference pitch files
+for i in ./FEMALE/REF/**/*.f0; do
+j="$(basename "$i" .f0)"
+echo $j
+cp "$i" ./combined_reference_f0/
+done
\ No newline at end of file
diff --git a/dnn/torch/neural-pitch/training.py b/dnn/torch/neural-pitch/training.py
new file mode 100644
index 00000000..bc0cce7c
--- /dev/null
+++ b/dnn/torch/neural-pitch/training.py
@@ -0,0 +1,162 @@
+"""
+Training the neural pitch estimator
+
+"""
+
+import argparse
+parser = argparse.ArgumentParser()
+
+parser.add_argument('features_if', type=str, help='.f32 IF Features for training (generated by augmentation script)')
+parser.add_argument('features_xcorr', type=str, help='.f32 Xcorr Features for training (generated by augmentation script)')
+parser.add_argument('features_pitch', type=str, help='.npy Pitch file for training (generated by augmentation script)')
+parser.add_argument('output_folder', type=str, help='Output directory to store the model weights and config')
+parser.add_argument('data_format', type=str, help='Choice of Input Data',choices=['if','xcorr','both'])
+parser.add_argument('--gpu_index', type=int, help='GPU index to use if multiple GPUs',default = 0,required = False)
+parser.add_argument('--confidence_threshold', type=float, help='Confidence value below which pitch will be neglected during training',default = 0.4,required = False)
+parser.add_argument('--context', type=int, help='Sequence length during training',default = 100,required = False)
+parser.add_argument('--N', type=int, help='STFT window size',default = 320,required = False)
+parser.add_argument('--H', type=int, help='STFT Hop size',default = 160,required = False)
+parser.add_argument('--xcorr_dimension', type=int, help='Dimension of Input cross-correlation',default = 257,required = False)
+parser.add_argument('--freq_keep', type=int, help='Number of Frequencies to keep',default = 30,required = False)
+parser.add_argument('--gru_dim', type=int, help='GRU Dimension',default = 64,required = False)
+parser.add_argument('--output_dim', type=int, help='Output dimension',default = 192,required = False)
+parser.add_argument('--learning_rate', type=float, help='Learning Rate',default = 1.0e-3,required = False)
+parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required = False)
+parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False)
+
+
+args = parser.parse_args()
+
+# import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_index)
+
+# Fixing the seeds for reproducibility
+import time
+np_seed = int(time.time())
+torch_seed = int(time.time())
+
+import json
+import torch
+torch.manual_seed(torch_seed)
+import numpy as np
+np.random.seed(np_seed)
+from utils import count_parameters
+import tqdm
+import sys
+from datetime import datetime
+from evaluation import rpa
+
+# print(list(range(torch.cuda.device_count())))
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# device = 'cpu'
+
+from models import loader_joint as loader
+if args.data_format == 'if':
+ from models import large_if_ccode as model
+ pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim)
+elif args.data_format == 'xcorr':
+ from models import large_xcorr as model
+ pitch_nn = model(args.xcorr_dimension,args.gru_dim,args.output_dim)
+else:
+ from models import large_joint as model
+ pitch_nn = model(args.freq_keep*3,args.xcorr_dimension,args.gru_dim,args.output_dim)
+
+dataset_training = loader(args.features_if,args.features_pitch,args.features_xcorr,args.confidence_threshold,args.context,args.data_format)
+
+def loss_custom(logits,labels,confidence,choice = 'default',nmax = 192,q = 0.7):
+ logits_softmax = torch.nn.Softmax(dim = 1)(logits).permute(0,2,1)
+ labels_one_hot = torch.nn.functional.one_hot(labels.long(),nmax)
+
+ if choice == 'default':
+ # Categorical Cross Entropy
+ CE = -torch.sum(torch.log(logits_softmax*labels_one_hot + 1.0e-6)*labels_one_hot,dim=-1)
+ CE = torch.sum(confidence*CE)
+
+ else:
+ # Robust Cross Entropy
+ CE = (1.0/q)*(1 - torch.sum(torch.pow(logits_softmax*labels_one_hot + 1.0e-7,q),dim=-1) )
+ CE = torch.sum(confidence*CE)
+
+ return CE
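+
+# Note: the 'robust' option is the generalized (L_q) cross entropy (1 - p_y^q)/q with q = 0.7,
+# which approaches the standard cross entropy as q -> 0 and is commonly used to reduce
+# sensitivity to noisy labels.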
+
+# features = args.features
+# pitch = args.crepe_pitch
+# dataset_training = loader(features,pitch,args.confidence_threshold,args.freq_keep,args.context)
+# dataset_training = loader(features,pitch,'../../../../testing/testing_features_10pct_xcorr.f32')
+
+train_dataset, test_dataset = torch.utils.data.random_split(dataset_training, [0.95,0.05],generator=torch.Generator().manual_seed(torch_seed))
+
+batch_size = 256
+train_dataloader = torch.utils.data.DataLoader(dataset = train_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
+test_dataloader = torch.utils.data.DataLoader(dataset = test_dataset,batch_size = batch_size,shuffle = True,num_workers = 0, pin_memory = False)
+
+# pitch_nn = model(args.freq_keep*3,args.gru_dim,args.output_dim).to(device)
+pitch_nn = pitch_nn.to(device)
+num_params = count_parameters(pitch_nn)
+learning_rate = args.learning_rate
+model_opt = torch.optim.Adam(pitch_nn.parameters(), lr = learning_rate)
+
+num_epochs = args.epochs
+
+for epoch in range(num_epochs):
+ losses = []
+ pitch_nn.train()
+ with tqdm.tqdm(train_dataloader) as train_epoch:
+ for i, (xi, yi, ci) in enumerate(train_epoch):
+ yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
+ pi = pitch_nn(xi.float())
+ loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
+
+ model_opt.zero_grad()
+ loss.backward()
+ model_opt.step()
+
+ losses.append(loss.item())
+ avg_loss = np.mean(losses)
+ train_epoch.set_postfix({"Train Epoch" : epoch, "Train Loss":avg_loss})
+
+ if epoch % 5 == 0:
+ pitch_nn.eval()
+ losses = []
+ with tqdm.tqdm(test_dataloader) as test_epoch:
+ for i, (xi, yi, ci) in enumerate(test_epoch):
+ yi, xi, ci = yi.to(device, non_blocking=True), xi.to(device, non_blocking=True), ci.to(device, non_blocking=True)
+ pi = pitch_nn(xi.float())
+ loss = loss_custom(logits = pi,labels = yi,confidence = ci,choice = args.choice_cel,nmax = args.output_dim)
+ losses.append(loss.item())
+ avg_loss = np.mean(losses)
+ test_epoch.set_postfix({"Epoch" : epoch, "Test Loss":avg_loss})
+
+pitch_nn.eval()
+rpa(pitch_nn,device,data_format = args.data_format)
+
+config = dict(
+data_format = args.data_format,
+epochs = num_epochs,
+window_size = args.N,
+hop_factor = args.H,
+freq_keep = args.freq_keep,
+batch_size = batch_size,
+learning_rate = learning_rate,
+confidence_threshold = args.confidence_threshold,
+model_parameters = num_params,
+np_seed = np_seed,
+torch_seed = torch_seed,
+xcorr_dim = args.xcorr_dimension,
+dim_input = 3*args.freq_keep,
+gru_dim = args.gru_dim,
+output_dim = args.output_dim,
+choice_cel = args.choice_cel,
+context = args.context,
+)
+
+now = datetime.now()
+dir_pth_save = args.output_folder
+dir_network = dir_pth_save + str(now) + '_net_' + args.data_format + '.pth'
+dir_dictparams = dir_pth_save + str(now) + '_config_' + args.data_format + '.json'
+# Save Weights
+torch.save(pitch_nn.state_dict(), dir_network)
+# Save Config
+with open(dir_dictparams, 'w') as fp:
+ json.dump(config, fp)
diff --git a/dnn/torch/neural-pitch/utils.py b/dnn/torch/neural-pitch/utils.py
new file mode 100644
index 00000000..8930ad19
--- /dev/null
+++ b/dnn/torch/neural-pitch/utils.py
@@ -0,0 +1,59 @@
+"""
+Utility functions that are commonly used
+"""
+
+import numpy as np
+from scipy.signal import windows, lfilter
+from prettytable import PrettyTable
+
+
+# Source: https://gist.github.com/thongonary/026210fc186eb5056f2b6f1ca362d912
+def count_parameters(model):
+ table = PrettyTable(["Modules", "Parameters"])
+ total_params = 0
+ for name, parameter in model.named_parameters():
+ if not parameter.requires_grad: continue
+ param = parameter.numel()
+ table.add_row([name, param])
+ total_params+=param
+ print(table)
+ print(f"Total Trainable Params: {total_params}")
+ return total_params
+
+def stft(x, w = 'boxcar', N = 320, H = 160):
+ x = np.concatenate([x,np.zeros(N)])
+ # win_custom = np.concatenate([windows.hann(80)[:40],np.ones(240),windows.hann(80)[40:]])
+ return np.stack([np.fft.rfft(x[i:i + N]*windows.get_window(w,N)) for i in np.arange(0,x.shape[0]-N,H)])
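+# stft() zero-pads the input by N samples and returns a (num_frames, N//2 + 1) complex array,
+# one row per hop of H samples.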
+
+def random_filter(x):
+ # Randomly filter x with second order IIR filter with coefficients in between -3/8,3/8
+ filter_coeff = np.random.uniform(low = -3.0/8, high = 3.0/8, size = 4)
+ b = [1,filter_coeff[0],filter_coeff[1]]
+ a = [1,filter_coeff[2],filter_coeff[3]]
+ return lfilter(b,a,x)
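+# With the coefficients drawn from [-3/8, 3/8] the poles stay inside the 2nd-order stability
+# triangle (|a2| < 1 and |a1| < 1 + a2), so the random filter is always stable.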
+
+def feature_xform(feature):
+ """
+ Take as input the (N * 256) xcorr features output by LPCNet and perform the following
+    1. Downsample and upsample by 2 (followed by smoothing)
+    2. Stack the downsampled, original and upsampled views along a new trailing axis
+       (the positional-embedding branch is currently commented out)
+ """
+
+ from scipy.signal import resample_poly, lfilter
+
+
+ feature_US = lfilter([0.25,0.5,0.25],[1],resample_poly(feature,2,1,axis = 1),axis = 1)[:,:feature.shape[1]]
+ feature_DS = lfilter([0.5,0.5],[1],resample_poly(feature,1,2,axis = 1),axis = 1)
+ Z_append = np.zeros((feature.shape[0],feature.shape[1] - feature_DS.shape[1]))
+ feature_DS = np.concatenate([feature_DS,Z_append],axis = -1)
+
+ # pos_embedding = []
+ # for i in range(k):
+ # pos_embedding.append(np.cos((2**i)*np.pi*((np.repeat(np.arange(feature.shape[1]).reshape(feature.shape[1],1),feature.shape[0],axis = 1)).T/(2*feature.shape[1]))))
+
+ # pos_embedding = np.stack(pos_embedding,axis = -1)
+
+ feature = np.stack((feature_DS,feature,feature_US),axis = -1)
+ # feature = np.concatenate((feature,pos_embedding),axis = -1)
+
+ return feature