diff options
author | Jan Buethe <jbuethe@amazon.de> | 2023-07-23 01:16:23 +0300 |
---|---|---|
committer | Jan Buethe <jbuethe@amazon.de> | 2023-07-23 01:16:23 +0300 |
commit | 587c1020feea25920851e984f7e2aef784263a57 (patch) | |
tree | 84012b84dde76141efb078023e23dac4755907b2 | |
parent | 7487168d52edd79c5dba9c10007c1aa821893a76 (diff) |
clean-up
-rw-r--r-- | dnn/torch/testsuite/run_test.py | 11 | ||||
-rw-r--r-- | dnn/torch/testsuite/utils/warpq.py | 177 |
2 files changed, 3 insertions, 185 deletions
diff --git a/dnn/torch/testsuite/run_test.py b/dnn/torch/testsuite/run_test.py index 0d828b2d..69463ddb 100644 --- a/dnn/torch/testsuite/run_test.py +++ b/dnn/torch/testsuite/run_test.py @@ -37,7 +37,6 @@ import shutil import yaml from utils.files import get_wave_file_list -from utils.warpq import compute_WAPRQ from utils.pesq import compute_PESQ from utils.pitch import compute_pitch_error @@ -51,7 +50,7 @@ parser.add_argument('--seed', type=int, help='seed for random item selection', d parser.add_argument('--fs', type=int, help="sampling rate at which input is presented as wave file (defaults to 16000)", default=16000) parser.add_argument('--num-workers', type=int, help="number of subprocesses to be used (default=4)", default=4) parser.add_argument('--plc-suffix', type=str, default="_is_lost.txt", help="suffix of plc error pattern file: only relevant if command chain uses PLCFILE (default=_is_lost.txt)") -parser.add_argument('--metrics', type=str, default='warpq', help='comma separated string of metrics, supported: {{"warpq", "pesq"}}, default="warpq"') +parser.add_argument('--metrics', type=str, default='pesq', help='comma separated string of metrics, supported: {{"pesq", "pitch_error", "voicing_error"}}, default="pesq"') parser.add_argument('--verbose', action='store_true', help='enables printouts of all commands run in the pipeline') def check_for_sox_in_path(): @@ -69,7 +68,7 @@ def run_save_sh(command, verbose=False): raise RuntimeError(f"command '{command}' failed with exit code {r.returncode}") -def run_processing_chain(input_path, output_path, model_commands, fs, metrics={'warpq'}, plc_suffix="_is_lost.txt", verbose=False): +def run_processing_chain(input_path, output_path, model_commands, fs, metrics={'pesq'}, plc_suffix="_is_lost.txt", verbose=False): # prepare model input model_input = output_path + ".resamp.wav" @@ -86,10 +85,7 @@ def run_processing_chain(input_path, output_path, model_commands, fs, metrics={' scores = dict() cache = dict() for metric in metrics: - if metric == 'warpq': - # run warpq - score = compute_WAPRQ(input_path, output_path, sr=fs) - elif metric == 'pesq': + if metric == 'pesq': # run pesq score = compute_PESQ(input_path, output_path, fs=fs) elif metric == 'pitch_error': @@ -241,7 +237,6 @@ def create_html(output_folder, results, title, metric): """) metric_sorting_signs = { - 'warpq' : -1, 'pesq' : 1, 'pitch_error' : -1, 'voicing_error' : -1 diff --git a/dnn/torch/testsuite/utils/warpq.py b/dnn/torch/testsuite/utils/warpq.py deleted file mode 100644 index 4d5b7877..00000000 --- a/dnn/torch/testsuite/utils/warpq.py +++ /dev/null @@ -1,177 +0,0 @@ - -""" -WARP-Q: Quality Prediction For Generative Neural Speech Codecs - -This is the WARP-Q version used in the ICASSP 2021 Paper: - -W. A. Jassim, J. Skoglund, M. Chinen, and A. Hines, “WARP-Q: Quality prediction -for generative neural speech codecs,” paper accepted for presentation at the 2021 IEEE -International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021). -Date of acceptance: 30 Jan 2021. Preprint: https://arxiv.org/pdf/2102.10449 - -Run using python 3.x and include these package dependencies in your virtual environment: - - pandas - - librosa - - numpy - - pyvad - - skimage - - speechpy - - soundfile - - scipy (optional) - - seaborn (optional, for plotting only) - - multiprocessing (optional, for parallel computing mode only) - - joblib (optional, for parallel computing mode only) - -Input: - - The main_test function calls a csv file that contains paths of audio files. - - The csv file cosists of four columns: - - Ref_Wave: reference speech - - Test_Wave: test speech - - MOS: subjective score (optinal, for plotting only) - - Codec: type of speech codec for the test speech (optinal, for plotting only) - -Output: - - Code will compute the WARP-Q quality scores between Ref_Wave and Test_Wave, - and will store the obrained results in a new column in the same csv file. - - -Releases: - -Warning: While this code has been tested and commented giving invalid input -files may cause unexpected results and will not be caught by robust exception -handling or validation checking. It will just fail or give you the wrong answer. - -In this simple and basic demo, we compute WARP-Q scores for 8 speech samples only. -More data should should be provided to have better score distributions. - - -(c) Dr Wissam Jassim - University College Dublin - wissam.a.jassim@gmail.com - wissam.jassim@ucd.ie - November 28, 2020 - -""" - -# Load libraries -import librosa, librosa.core, librosa.display -import numpy as np -from pyvad import vad -from skimage.util.shape import view_as_windows -import speechpy -import soundfile as sf - -################################ WARP-Q ####################################### -def compute_WAPRQ(ref_path,test_path,sr=16000,n_mfcc=12,fmax=5000,patch_size=0.4, - sigma=np.array([[1,1],[3,2],[1,3]])): - - # Inputs: - # refPath: path of reference speech - # disPath: path pf degraded speech - # sr: sampling frequency, Hz - # n_mfcc: number of MFCCs - # fmax: cutoff frequency - # patch_size: size of each patch in s - # sigma: step size conditon for DTW - - # Output: - # WARP-Q quality score between refPath and disPath - - - ####################### Load speech files ################################# - # Load Ref Speech - if ref_path[-4:] == '.wav': - speech_Ref, sr_Ref = librosa.load(ref_path,sr=sr) - else: - if ref_path[-4:] == '.SRC': #For ITUT database if applicable - speech_Ref, sr_Ref = sf.read(ref_path, format='RAW', channels=1, samplerate=16000, - subtype='PCM_16', endian='LITTLE') - if sr_Ref != sr: - speech_Ref = librosa.resample(speech_Ref, sr_Ref, sr) - sr_Ref = sr - - # Load Coded Speech - if test_path[-4:] == '.wav': - speech_Coded, sr_Coded = librosa.load(test_path,sr=sr) - else: - if test_path[-4:] == '.OUT': #For ITUT database if applicable - speech_Coded, sr_Coded = sf.read(test_path, format='RAW', channels=1, samplerate=16000, - subtype='PCM_16', endian='LITTLE') - if sr_Coded != sr: - speech_Coded = librosa.resample(speech_Coded, sr_Coded, sr) - sr_Coded = sr - - if sr_Ref != sr_Coded: - raise ValueError("Reference and degraded signals should have same sampling rate!") - - # Make sure amplitudes are in the range of [-1, 1] otherwise clipping to -1 to 1 - # after resampling (if applicable). We experienced this issue for TCD-VOIP database only - speech_Ref[speech_Ref>1]=1.0 - speech_Ref[speech_Ref<-1]=-1.0 - - speech_Coded[speech_Coded>1]=1.0 - speech_Coded[speech_Coded<-1]=-1.0 - - ########################################################################### - - win_length = int(0.032*sr) #32 ms frame - hop_length = int(0.004*sr) #4 ms overlap - #hop_length = int(0.016*sr) - - n_fft = 2*win_length - lifter = 3 - - # DTW Parameters - Metric = 'euclidean' - - # VAD Parameters - hop_size_vad = 30 - sr_vad = sr - aggresive = 0 - - # VAD for Ref speech - vact1 = vad(speech_Ref, sr, fs_vad = sr_vad, hop_length = hop_size_vad, vad_mode=aggresive) - speech_Ref_vad = speech_Ref[vact1==1] - - # VAD for Coded speech - vact2 = vad(speech_Coded, sr, fs_vad = sr_vad, hop_length = hop_size_vad, vad_mode=aggresive) - speech_Coded_vad = speech_Coded[vact2==1] - - # Compute MFCC features for the two signals - - mfcc_Ref = librosa.feature.mfcc(y=speech_Ref_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax, - n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter) - mfcc_Coded = librosa.feature.mfcc(y=speech_Coded_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax, - n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter) - - # Feature Normalisation using CMVNW method - mfcc_Ref = speechpy.processing.cmvnw(mfcc_Ref.T,win_size=201,variance_normalization=True).T - mfcc_Coded = speechpy.processing.cmvnw(mfcc_Coded.T,win_size=201,variance_normalization=True).T - - # Divid MFCC features of Coded speech into patches - cols = int(patch_size/(hop_length/sr)) - window_shape = (np.size(mfcc_Ref,0), cols) - step = int(cols/2) - - mfcc_Coded_patch = view_as_windows(mfcc_Coded, window_shape, step) - - Acc =[] - band_rad = 0.25 - weights_mul=np.array([1, 1, 1]) - - # Compute alignment cose between each patch and Ref MFCC - for i in range(mfcc_Coded_patch.shape[1]): - - patch = mfcc_Coded_patch[0][i] - - D, P = librosa.sequence.dtw(X=patch, Y=mfcc_Ref, metric=Metric, - step_sizes_sigma=sigma, weights_mul=weights_mul, - band_rad=band_rad, subseq=True, backtrack=True) - - P_librosa = P[::-1, :] - b_ast = P_librosa[-1, 1] - - Acc.append(D[-1, b_ast] / D.shape[0]) - - # Final score - return np.median(Acc).item() |