Welcome to mirror list, hosted at ThFree Co, Russian Federation.

gitlab.xiph.org/xiph/opus.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJan Buethe <jbuethe@amazon.de>2023-07-23 01:16:23 +0300
committerJan Buethe <jbuethe@amazon.de>2023-07-23 01:16:23 +0300
commit587c1020feea25920851e984f7e2aef784263a57 (patch)
tree84012b84dde76141efb078023e23dac4755907b2
parent7487168d52edd79c5dba9c10007c1aa821893a76 (diff)
clean-up
-rw-r--r--dnn/torch/testsuite/run_test.py11
-rw-r--r--dnn/torch/testsuite/utils/warpq.py177
2 files changed, 3 insertions, 185 deletions
diff --git a/dnn/torch/testsuite/run_test.py b/dnn/torch/testsuite/run_test.py
index 0d828b2d..69463ddb 100644
--- a/dnn/torch/testsuite/run_test.py
+++ b/dnn/torch/testsuite/run_test.py
@@ -37,7 +37,6 @@ import shutil
import yaml
from utils.files import get_wave_file_list
-from utils.warpq import compute_WAPRQ
from utils.pesq import compute_PESQ
from utils.pitch import compute_pitch_error
@@ -51,7 +50,7 @@ parser.add_argument('--seed', type=int, help='seed for random item selection', d
parser.add_argument('--fs', type=int, help="sampling rate at which input is presented as wave file (defaults to 16000)", default=16000)
parser.add_argument('--num-workers', type=int, help="number of subprocesses to be used (default=4)", default=4)
parser.add_argument('--plc-suffix', type=str, default="_is_lost.txt", help="suffix of plc error pattern file: only relevant if command chain uses PLCFILE (default=_is_lost.txt)")
-parser.add_argument('--metrics', type=str, default='warpq', help='comma separated string of metrics, supported: {{"warpq", "pesq"}}, default="warpq"')
+parser.add_argument('--metrics', type=str, default='pesq', help='comma separated string of metrics, supported: {{"pesq", "pitch_error", "voicing_error"}}, default="pesq"')
parser.add_argument('--verbose', action='store_true', help='enables printouts of all commands run in the pipeline')
def check_for_sox_in_path():
@@ -69,7 +68,7 @@ def run_save_sh(command, verbose=False):
raise RuntimeError(f"command '{command}' failed with exit code {r.returncode}")
-def run_processing_chain(input_path, output_path, model_commands, fs, metrics={'warpq'}, plc_suffix="_is_lost.txt", verbose=False):
+def run_processing_chain(input_path, output_path, model_commands, fs, metrics={'pesq'}, plc_suffix="_is_lost.txt", verbose=False):
# prepare model input
model_input = output_path + ".resamp.wav"
@@ -86,10 +85,7 @@ def run_processing_chain(input_path, output_path, model_commands, fs, metrics={'
scores = dict()
cache = dict()
for metric in metrics:
- if metric == 'warpq':
- # run warpq
- score = compute_WAPRQ(input_path, output_path, sr=fs)
- elif metric == 'pesq':
+ if metric == 'pesq':
# run pesq
score = compute_PESQ(input_path, output_path, fs=fs)
elif metric == 'pitch_error':
@@ -241,7 +237,6 @@ def create_html(output_folder, results, title, metric):
""")
metric_sorting_signs = {
- 'warpq' : -1,
'pesq' : 1,
'pitch_error' : -1,
'voicing_error' : -1
diff --git a/dnn/torch/testsuite/utils/warpq.py b/dnn/torch/testsuite/utils/warpq.py
deleted file mode 100644
index 4d5b7877..00000000
--- a/dnn/torch/testsuite/utils/warpq.py
+++ /dev/null
@@ -1,177 +0,0 @@
-
-"""
-WARP-Q: Quality Prediction For Generative Neural Speech Codecs
-
-This is the WARP-Q version used in the ICASSP 2021 Paper:
-
-W. A. Jassim, J. Skoglund, M. Chinen, and A. Hines, “WARP-Q: Quality prediction
-for generative neural speech codecs,” paper accepted for presentation at the 2021 IEEE
-International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021).
-Date of acceptance: 30 Jan 2021. Preprint: https://arxiv.org/pdf/2102.10449
-
-Run using python 3.x and include these package dependencies in your virtual environment:
- - pandas
- - librosa
- - numpy
- - pyvad
- - skimage
- - speechpy
- - soundfile
- - scipy (optional)
- - seaborn (optional, for plotting only)
- - multiprocessing (optional, for parallel computing mode only)
- - joblib (optional, for parallel computing mode only)
-
-Input:
- - The main_test function calls a csv file that contains paths of audio files.
-    - The csv file consists of four columns:
- - Ref_Wave: reference speech
- - Test_Wave: test speech
-        - MOS: subjective score (optional, for plotting only)
-        - Codec: type of speech codec for the test speech (optional, for plotting only)
-
-Output:
- - Code will compute the WARP-Q quality scores between Ref_Wave and Test_Wave,
-      and will store the obtained results in a new column in the same csv file.
-
-
-Releases:
-
-Warning: While this code has been tested and commented giving invalid input
-files may cause unexpected results and will not be caught by robust exception
-handling or validation checking. It will just fail or give you the wrong answer.
-
-In this simple and basic demo, we compute WARP-Q scores for 8 speech samples only.
-More data should be provided to have better score distributions.
-
-
-(c) Dr Wissam Jassim
- University College Dublin
- wissam.a.jassim@gmail.com
- wissam.jassim@ucd.ie
- November 28, 2020
-
-"""
-
-# Load libraries
-import librosa, librosa.core, librosa.display
-import numpy as np
-from pyvad import vad
-from skimage.util.shape import view_as_windows
-import speechpy
-import soundfile as sf
-
-################################ WARP-Q #######################################
-def compute_WAPRQ(ref_path,test_path,sr=16000,n_mfcc=12,fmax=5000,patch_size=0.4,
- sigma=np.array([[1,1],[3,2],[1,3]])):
-
- # Inputs:
- # refPath: path of reference speech
-    #    disPath: path of degraded speech
- # sr: sampling frequency, Hz
- # n_mfcc: number of MFCCs
- # fmax: cutoff frequency
- # patch_size: size of each patch in s
-    #    sigma: step size condition for DTW
-
- # Output:
- # WARP-Q quality score between refPath and disPath
-
-
- ####################### Load speech files #################################
- # Load Ref Speech
- if ref_path[-4:] == '.wav':
- speech_Ref, sr_Ref = librosa.load(ref_path,sr=sr)
- else:
- if ref_path[-4:] == '.SRC': #For ITUT database if applicable
- speech_Ref, sr_Ref = sf.read(ref_path, format='RAW', channels=1, samplerate=16000,
- subtype='PCM_16', endian='LITTLE')
- if sr_Ref != sr:
- speech_Ref = librosa.resample(speech_Ref, sr_Ref, sr)
- sr_Ref = sr
-
- # Load Coded Speech
- if test_path[-4:] == '.wav':
- speech_Coded, sr_Coded = librosa.load(test_path,sr=sr)
- else:
- if test_path[-4:] == '.OUT': #For ITUT database if applicable
- speech_Coded, sr_Coded = sf.read(test_path, format='RAW', channels=1, samplerate=16000,
- subtype='PCM_16', endian='LITTLE')
- if sr_Coded != sr:
- speech_Coded = librosa.resample(speech_Coded, sr_Coded, sr)
- sr_Coded = sr
-
- if sr_Ref != sr_Coded:
- raise ValueError("Reference and degraded signals should have same sampling rate!")
-
- # Make sure amplitudes are in the range of [-1, 1] otherwise clipping to -1 to 1
- # after resampling (if applicable). We experienced this issue for TCD-VOIP database only
- speech_Ref[speech_Ref>1]=1.0
- speech_Ref[speech_Ref<-1]=-1.0
-
- speech_Coded[speech_Coded>1]=1.0
- speech_Coded[speech_Coded<-1]=-1.0
-
- ###########################################################################
-
- win_length = int(0.032*sr) #32 ms frame
- hop_length = int(0.004*sr) #4 ms overlap
- #hop_length = int(0.016*sr)
-
- n_fft = 2*win_length
- lifter = 3
-
- # DTW Parameters
- Metric = 'euclidean'
-
- # VAD Parameters
- hop_size_vad = 30
- sr_vad = sr
- aggresive = 0
-
- # VAD for Ref speech
- vact1 = vad(speech_Ref, sr, fs_vad = sr_vad, hop_length = hop_size_vad, vad_mode=aggresive)
- speech_Ref_vad = speech_Ref[vact1==1]
-
- # VAD for Coded speech
- vact2 = vad(speech_Coded, sr, fs_vad = sr_vad, hop_length = hop_size_vad, vad_mode=aggresive)
- speech_Coded_vad = speech_Coded[vact2==1]
-
- # Compute MFCC features for the two signals
-
- mfcc_Ref = librosa.feature.mfcc(y=speech_Ref_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax,
- n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter)
- mfcc_Coded = librosa.feature.mfcc(y=speech_Coded_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax,
- n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter)
-
- # Feature Normalisation using CMVNW method
- mfcc_Ref = speechpy.processing.cmvnw(mfcc_Ref.T,win_size=201,variance_normalization=True).T
- mfcc_Coded = speechpy.processing.cmvnw(mfcc_Coded.T,win_size=201,variance_normalization=True).T
-
-    # Divide MFCC features of Coded speech into patches
- cols = int(patch_size/(hop_length/sr))
- window_shape = (np.size(mfcc_Ref,0), cols)
- step = int(cols/2)
-
- mfcc_Coded_patch = view_as_windows(mfcc_Coded, window_shape, step)
-
- Acc =[]
- band_rad = 0.25
- weights_mul=np.array([1, 1, 1])
-
-    # Compute alignment cost between each patch and Ref MFCC
- for i in range(mfcc_Coded_patch.shape[1]):
-
- patch = mfcc_Coded_patch[0][i]
-
- D, P = librosa.sequence.dtw(X=patch, Y=mfcc_Ref, metric=Metric,
- step_sizes_sigma=sigma, weights_mul=weights_mul,
- band_rad=band_rad, subseq=True, backtrack=True)
-
- P_librosa = P[::-1, :]
- b_ast = P_librosa[-1, 1]
-
- Acc.append(D[-1, b_ast] / D.shape[0])
-
- # Final score
- return np.median(Acc).item()