author     Jan Buethe <jbuethe@amazon.de>    2023-10-24 17:49:28 +0300
committer  Jan Buethe <jbuethe@amazon.de>    2023-10-24 17:49:28 +0300
commit     b0a525109a71cda3fdd0ddd211222ffb21683537 (patch)
tree       bddcf7207d1c3c52aaa60e96c1eb17337a73ac67
parent     8f89dddd38375311e1d815462b0803f42630d249 (diff)

updated to new neural pitch dataset (opus-ng-2bc)
-rw-r--r--  dnn/torch/osce/README.md                        |   3
-rw-r--r--  dnn/torch/osce/adv_train_vocoder.py             |  22
-rw-r--r--  dnn/torch/osce/data/lpcnet_vocoding_dataset.py  |   7
-rw-r--r--  dnn/torch/osce/models/__init__.py               |   4
-rw-r--r--  dnn/torch/osce/models/lavoce_400_ar.py          |  16
-rw-r--r--  dnn/torch/osce/models/lavoce_400_ar2.py         | 295
-rw-r--r--  dnn/torch/osce/test_vocoder.py                  |   3
-rw-r--r--  dnn/torch/osce/train_vocoder.py                 |  22
-rw-r--r--  dnn/torch/osce/utils/layers/td_shaper.py        |  41
-rw-r--r--  dnn/torch/osce/utils/lpcnet_features.py         |   7

10 files changed, 373 insertions, 47 deletions
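The substantive change in this commit is a new pitch-period encoding for feature version 3: instead of the old affine map from the raw pitch feature to a lag, version 3 stores a log-domain value that is decoded to a lag of 32 to 256 samples. A minimal sketch of the two decoders as they appear in `lpcnet_vocoding_dataset.py` and `lpcnet_features.py` below (`period_v2` and `period_v3` are illustrative names, not functions in the repo):

import numpy as np

# Sketch of the two period decoders this commit switches between.
# `p` is the raw pitch feature from an LPCNet feature file.

def period_v2(p):
    # version < 3: affine map from feature to lag in samples
    return (0.1 + 50 * p + 100).astype('int16')

def period_v3(p):
    # version 3: log-domain feature, decoded and clipped to [32, 256]
    return np.round(np.clip(256. / 2 ** (p + 1.5), 32, 256)).astype('int')

p = np.array([-1.5, 0.0, 1.5])
print(period_v2(p))   # -> [ 25 100 175]
print(period_v3(p))   # -> [256  91  32]

At 16 kHz, the version-3 clipping range corresponds to pitches between 62.5 Hz (lag 256) and 500 Hz (lag 32).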
diff --git a/dnn/torch/osce/README.md b/dnn/torch/osce/README.md
index b1475d91..c7e0806e 100644
--- a/dnn/torch/osce/README.md
+++ b/dnn/torch/osce/README.md
@@ -12,3 +12,6 @@ The code is tested with python 3.11. Conda setup is done via
 
 `conda activate osce`
 `python -m pip install -r requirements.txt`
+
+
+## Training Data
diff --git a/dnn/torch/osce/adv_train_vocoder.py b/dnn/torch/osce/adv_train_vocoder.py
index 3728bc2f..dc24b843 100644
--- a/dnn/torch/osce/adv_train_vocoder.py
+++ b/dnn/torch/osce/adv_train_vocoder.py
@@ -131,17 +131,6 @@ with open(os.path.join(args.output, setup_name), 'w') as f:
     yaml.dump(setup, f)
 
-ref = None
-# prepare inference test if wanted
-inference_test = False
-if type(args.test_features) != type(None):
-    test_features = load_lpcnet_features(args.test_features)
-    features = test_features['features']
-    periods = test_features['periods']
-    inference_folder = os.path.join(args.output, 'inference_test')
-    os.makedirs(inference_folder, exist_ok=True)
-    inference_test = True
-
 # training parameters
 
 batch_size = setup['training']['batch_size']
@@ -170,6 +159,17 @@ if 'validation_dataset' in setup:
 else:
     run_validation = False
 
+ref = None
+# prepare inference test if wanted
+inference_test = False
+if type(args.test_features) != type(None):
+    test_features = load_lpcnet_features(args.test_features, version=data.version)
+    features = test_features['features']
+    periods = test_features['periods']
+    inference_folder = os.path.join(args.output, 'inference_test')
+    os.makedirs(inference_folder, exist_ok=True)
+    inference_test = True
+
 # create model
 model = model_dict[model_name](*setup['model']['args'], **setup['model']['kwargs'])
diff --git a/dnn/torch/osce/data/lpcnet_vocoding_dataset.py b/dnn/torch/osce/data/lpcnet_vocoding_dataset.py
index 36c8c724..d9b5c6b8 100644
--- a/dnn/torch/osce/data/lpcnet_vocoding_dataset.py
+++ b/dnn/torch/osce/data/lpcnet_vocoding_dataset.py
@@ -86,6 +86,8 @@ class LPCNetVocodingDataset(Dataset):
             self.getitem = self.getitem_v1
         elif self.version == 2:
             self.getitem = self.getitem_v2
+        elif self.version == 3:
+            self.getitem = self.getitem_v2
         else:
             raise ValueError(f"dataset version {self.version} unknown")
 
@@ -138,7 +140,10 @@ class LPCNetVocodingDataset(Dataset):
 
         # convert periods
         if 'periods' in self.input_features:
-            sample['periods'] = (0.1 + 50 * sample['periods'] + 100).astype('int16')
+            if self.version < 3:
+                sample['periods'] = (0.1 + 50 * sample['periods'] + 100).astype('int16')
+            else:
+                sample['periods'] = np.round(np.clip(256./2**(sample['periods']+1.5), 32, 256)).astype('int')
 
         signal_start = (self.frame_offset + index * self.frames_per_sample) * self.frame_length
         signal_stop = (self.frame_offset + (index + 1) * self.frames_per_sample) * self.frame_length
diff --git a/dnn/torch/osce/models/__init__.py b/dnn/torch/osce/models/__init__.py
index e6bbbc36..eb491464 100644
--- a/dnn/torch/osce/models/__init__.py
+++ b/dnn/torch/osce/models/__init__.py
@@ -33,6 +33,7 @@ from .lavoce import LaVoce
 from .lavoce_cont import LaVoceCont
 from .lavoce_400 import LaVoce400
 from .lavoce_400_ar import LaVoce400AR
+from .lavoce_400_ar2 import LaVoce400AR2
 from .fd_discriminator import TFDMultiResolutionDiscriminator as FDMResDisc
 
 model_dict = {
@@ -42,5 +43,6 @@ model_dict = {
     'lavocecont': LaVoceCont,
     'lavoce400': LaVoce400,
     'fdmresdisc': FDMResDisc,
-    'lavoce400ar': LaVoce400AR
+    'lavoce400ar': LaVoce400AR,
+    'lavoce400ar2': LaVoce400AR2
 }
diff --git a/dnn/torch/osce/models/lavoce_400_ar.py b/dnn/torch/osce/models/lavoce_400_ar.py
index 4bbb6a17..9955de4d 100644
--- a/dnn/torch/osce/models/lavoce_400_ar.py
+++ b/dnn/torch/osce/models/lavoce_400_ar.py
@@ -103,9 +103,9 @@ class LaVoce400AR(nn.Module):
         self.af1 = LimitedAdaptiveConv1d(1, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
 
         # non-linear transforms
-        self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True)
-        self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
-        self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
+        self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True, kernel_size=4)
+        self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=4)
+        self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=4)
 
         # combinators
         self.af2 = LimitedAdaptiveConv1d(2, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
@@ -226,7 +226,15 @@ class LaVoce400AR(nn.Module):
 
         frames = []
 
-        for i in range(num_frames):
+        # pre-load buffer
+        if signal is not None:
+            frames = [signal[:, :, :4*self.FRAME_SIZE]]
+            with torch.no_grad():
+                for i in range(4):
+                    y, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
+                    last_frame = signal[:, :, i * self.FRAME_SIZE : (i + 1) * self.FRAME_SIZE]
+
+        for i in range(4 if nb_pre_frames > 0 else 0, num_frames):
             y, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
             y = torch.cat((y, prior[..., i * self.FRAME_SIZE : (i+1) * self.FRAME_SIZE]), dim=1)
             y, state_af_mix = self.af_mix(y, cf[:, i:i+1], state=state_af_mix, return_state=True)
diff --git a/dnn/torch/osce/models/lavoce_400_ar2.py b/dnn/torch/osce/models/lavoce_400_ar2.py
new file mode 100644
index 00000000..73487831
--- /dev/null
+++ b/dnn/torch/osce/models/lavoce_400_ar2.py
@@ -0,0 +1,295 @@
+"""
+/* Copyright (c) 2023 Amazon
+   Written by Jan Buethe */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+import numpy as np
+
+from utils.layers.limited_adaptive_comb1d import LimitedAdaptiveComb1d
+from utils.layers.limited_adaptive_conv1d import LimitedAdaptiveConv1d
+from utils.layers.ar_filter import ARFilter
+from utils.layers.td_shaper import TDShaper
+from utils.layers.noise_shaper import NoiseShaper
+from utils.complexity import _conv1d_flop_count
+from utils.endoscopy import write_data
+
+from models.nns_base import NNSBase
+from models.lpcnet_feature_net import LPCNetFeatureNet
+from .scale_embedding import ScaleEmbedding
+
+class LaVoce400AR2(nn.Module):
+    """ Linear-Adaptive VOCodEr """
+    FEATURE_FRAME_SIZE=160
+    FRAME_SIZE=40
+
+    def __init__(self,
+                 num_features=20,
+                 pitch_embedding_dim=64,
+                 cond_dim=256,
+                 pitch_max=300,
+                 kernel_size=15,
+                 preemph=0.85,
+                 comb_gain_limit_db=-6,
+                 global_gain_limits_db=[-6, 6],
+                 conv_gain_limits_db=[-6, 6],
+                 norm_p=2,
+                 avg_pool_k=4,
+                 pulses=False,
+                 tdshape_kernel_size=4):
+
+        super().__init__()
+
+
+        self.num_features = num_features
+        self.cond_dim = cond_dim
+        self.pitch_max = pitch_max
+        self.pitch_embedding_dim = pitch_embedding_dim
+        self.kernel_size = kernel_size
+        self.preemph = preemph
+        self.pulses = pulses
+
+        assert self.FEATURE_FRAME_SIZE % self.FRAME_SIZE == 0
+        self.upsamp_factor = self.FEATURE_FRAME_SIZE // self.FRAME_SIZE
+
+        # pitch embedding
+        self.pitch_embedding = nn.Embedding(pitch_max + 1, pitch_embedding_dim)
+
+        # feature net
+        self.feature_net = LPCNetFeatureNet(num_features + pitch_embedding_dim, cond_dim, self.upsamp_factor)
+
+        # noise shaper
+        self.noise_shaper = NoiseShaper(cond_dim, self.FRAME_SIZE)
+
+        # comb filters
+        left_pad = self.kernel_size // 2
+        right_pad = self.kernel_size - 1 - left_pad
+        self.cf1 = LimitedAdaptiveComb1d(self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, use_bias=False, padding=[left_pad, right_pad], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, global_gain_limits_db=global_gain_limits_db, norm_p=norm_p)
+        self.cf2 = LimitedAdaptiveComb1d(self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, use_bias=False, padding=[left_pad, right_pad], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, global_gain_limits_db=global_gain_limits_db, norm_p=norm_p)
+
+        self.cf_ar = ARFilter(5, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, padding=[2, 2], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, norm_p=norm_p)
+
+        self.af_prescale = LimitedAdaptiveConv1d(2, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+        self.af_mix = LimitedAdaptiveConv1d(3, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+        # spectral shaping
+        self.af1 = LimitedAdaptiveConv1d(1, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+        # non-linear transforms
+        self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True, kernel_size=tdshape_kernel_size, tanh_activation=True)
+        self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=tdshape_kernel_size, tanh_activation=True)
+        self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=tdshape_kernel_size, tanh_activation=True)
+
+        # combinators
+        self.af2 = LimitedAdaptiveConv1d(3, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+        self.af3 = LimitedAdaptiveConv1d(3, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+        self.af4 = LimitedAdaptiveConv1d(3, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+        # feature transforms
+        self.post_cf1 = nn.Conv1d(cond_dim, cond_dim, 2)
+        self.post_cf2 = nn.Conv1d(cond_dim, cond_dim, 2)
+        self.post_af1 = nn.Conv1d(cond_dim, cond_dim, 2)
+        self.post_af2 = nn.Conv1d(cond_dim, cond_dim, 2)
+        self.post_af3 = nn.Conv1d(cond_dim, cond_dim, 2)
+
+
+    def create_phase_signals(self, periods):
+
+        batch_size = periods.size(0)
+        progression = torch.arange(1, self.FRAME_SIZE + 1, dtype=periods.dtype, device=periods.device).view((1, -1))
+        progression = torch.repeat_interleave(progression, batch_size, 0)
+
+        phase0 = torch.zeros(batch_size, dtype=periods.dtype, device=periods.device).unsqueeze(-1)
+        chunks = []
+        for sframe in range(periods.size(1)):
+            f = (2.0 * torch.pi / periods[:, sframe]).unsqueeze(-1)
+
+            if self.pulses:
+                alpha = torch.cos(f).view(batch_size, 1, 1)
+                chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+                pulse_a = torch.relu(chunk_sin - alpha) / (1 - alpha)
+                pulse_b = torch.relu(-chunk_sin - alpha) / (1 - alpha)
+
+                chunk = torch.cat((pulse_a, pulse_b), dim = 1)
+            else:
+                chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+                chunk_cos = torch.cos(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+
+                chunk = torch.cat((chunk_sin, chunk_cos), dim = 1)
+
+            phase0 = phase0 + self.FRAME_SIZE * f
+
+            chunks.append(chunk)
+
+        phase_signals = torch.cat(chunks, dim=-1)
+
+        return phase_signals
+
+    def flop_count(self, rate=16000, verbose=False):
+
+        frame_rate = rate / self.FRAME_SIZE
+
+        # feature net
+        feature_net_flops = self.feature_net.flop_count(frame_rate)
+        comb_flops = self.cf1.flop_count(rate) + self.cf2.flop_count(rate) + self.cf_ar.flop_count(rate)
+        af_flops = self.af1.flop_count(rate) + self.af2.flop_count(rate) + self.af3.flop_count(rate) + self.af4.flop_count(rate) + self.af_mix.flop_count(rate) + self.af_prescale.flop_count(rate)
+        feature_flops = (_conv1d_flop_count(self.post_cf1, frame_rate) + _conv1d_flop_count(self.post_cf2, frame_rate)
+                         + _conv1d_flop_count(self.post_af1, frame_rate) + _conv1d_flop_count(self.post_af2, frame_rate) + _conv1d_flop_count(self.post_af3, frame_rate))
+
+        if verbose:
+            print(f"feature net: {feature_net_flops / 1e6} MFLOPS")
+            print(f"comb filters: {comb_flops / 1e6} MFLOPS")
+            print(f"adaptive conv: {af_flops / 1e6} MFLOPS")
+            print(f"feature transforms: {feature_flops / 1e6} MFLOPS")
+
+        return feature_net_flops + comb_flops + af_flops + feature_flops
+
+    def feature_transform(self, f, layer):
+        f = f.permute(0, 2, 1)
+        f = F.pad(f, [1, 0])
+        f = torch.tanh(layer(f))
+        return f.permute(0, 2, 1)
+
+    def forward(self, features, periods, signal=None, debug=False):
+
+        periods = periods.squeeze(-1)
+        pitch_embedding = self.pitch_embedding(periods)
+
+        if signal is not None:
+            nb_pre_frames = signal.size(-1) // self.FRAME_SIZE
+            if len(signal.shape) < 3:
+                signal = signal.unsqueeze(1)
+        else:
+            nb_pre_frames = 0
+
+        full_features = torch.cat((features, pitch_embedding), dim=-1)
+        cf = self.feature_net(full_features)
+        cf1 = self.feature_transform(cf, self.post_af2)
+        cf2 = self.feature_transform(cf1, self.post_af3)
+        cf3 = self.feature_transform(cf2, self.post_cf1)
+        cf4 = self.feature_transform(cf3, self.post_cf2)
+        cf5 = self.feature_transform(cf4, self.post_af1)
+
+
+        # upsample periods
+        periods = torch.repeat_interleave(periods, self.upsamp_factor, 1)
+        periods_ar = torch.where(periods > 42, periods, 2*periods)
+
+        num_frames = periods.size(1)
+
+        # pre-net
+        ref_phase = torch.tanh(self.create_phase_signals(periods))
+        x = self.af_prescale(ref_phase, cf)
+        noise = self.noise_shaper(cf)
+        prior = torch.cat((x, noise), dim=1)
+
+        # states
+        state_cf_ar = None
+        state_af_mix = None
+        state_tdshape1 = None
+        state_tdshape2 = None
+        state_cf1 = None
+        state_cf2 = None
+        state_af1 = None
+        state_af2 = None
+        state_af3 = None
+        state_tdshape3 = None
+        state_af4 = None
+        last_frame = torch.zeros((features.size(0), 1, self.FRAME_SIZE), device=features.device)
+
+        frames = []
+
+        # pre-load buffer
+        if signal is not None:
+            frames = [signal[:, :, :4*self.FRAME_SIZE]]
+            with torch.no_grad():
+                for i in range(4):
+                    y, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
+                    last_frame = signal[:, :, i * self.FRAME_SIZE : (i + 1) * self.FRAME_SIZE]
+
+
+        for i in range(4 if nb_pre_frames > 0 else 0, num_frames):
+            pred, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
+            y = torch.cat((pred, prior[..., i * self.FRAME_SIZE : (i+1) * self.FRAME_SIZE]), dim=1)
+            y, state_af_mix = self.af_mix(y, cf[:, i:i+1], state=state_af_mix, return_state=True)
+
+            # temporal shaping + innovating
+            y1 = y[:, 0:1, :]
+            y2, state_tdshape1 = self.tdshape1(y[:, 1:2, :], cf[:, i:i+1], state=state_tdshape1, return_state=True)
+            y = torch.cat((y1, y2, pred), dim=1)
+            y, state_af2 = self.af2(y, cf[:, i:i+1], state=state_af2, return_state=True, debug=debug)
+
+            # second temporal shaping
+            y1 = y[:, 0:1, :]
+            y2, state_tdshape2 = self.tdshape2(y[:, 1:2, :], cf1[:, i:i+1], state=state_tdshape2, return_state=True)
+            y = torch.cat((y1, y2, pred), dim=1)
+            y, state_af3 = self.af3(y, cf1[:, i:i+1], state=state_af3, return_state=True, debug=debug)
+
+            # spectral shaping
+            y, state_cf1 = self.cf1(y, cf2[:, i:i+1], periods[:, i:i+1], state=state_cf1, return_state=True, debug=debug)
+            y, state_cf2 = self.cf2(y, cf3[:, i:i+1], periods[:, i:i+1], state=state_cf2, return_state=True, debug=debug)
+            y, state_af1 = self.af1(y, cf4[:, i:i+1], state=state_af1, return_state=True, debug=debug)
+
+            # final temporal env adjustment
+            y1 = y[:, 0:1, :]
+            y2, state_tdshape3 = self.tdshape3(y[:, 1:2, :], cf5[:, i:i+1], state=state_tdshape3, return_state=True)
+            y = torch.cat((y1, y2, pred), dim=1)
+            y, state_af4 = self.af4(y, cf5[:, i:i+1], state=state_af4, return_state=True, debug=debug)
+
+            if i < nb_pre_frames:
+                y = signal[:, :, i * self.FRAME_SIZE : (i + 1) * self.FRAME_SIZE]
+
+            last_frame = y
+            frames.append(y)
+
+        return torch.cat(frames, dim=-1)
+
+    def process(self, features, periods, debug=False):
+
+        self.eval()
+        device = next(iter(self.parameters())).device
+        with torch.no_grad():
+
+            # run model
+            f = features.unsqueeze(0).to(device)
+            p = periods.unsqueeze(0).to(device)
+
+            y = self.forward(f, p, debug=debug).squeeze()
+
+            # deemphasis
+            if self.preemph > 0:
+                for i in range(len(y) - 1):
+                    y[i + 1] += self.preemph * y[i]
+
+            # clip to valid range
+            out = torch.clip((2**15) * y, -2**15, 2**15 - 1).short()
+
+        return out
\ No newline at end of file
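Both the modified `LaVoce400AR.forward()` and the new `LaVoce400AR2.forward()` above share the same warm-up idiom: when a ground-truth `signal` is supplied, the first four frames are copied to the output verbatim and only fed through the AR comb filter under `torch.no_grad()` to prime its state; generation then free-runs from frame 4. A stripped-down sketch of that control flow (pure Python, with a dummy `step` standing in for the adaptive filter cascade):

# Minimal sketch of the 4-frame warm-up pattern used in forward().
def run_frames(num_frames, signal=None, warmup=4):
    out, last, state = [], 0, 0

    def step(prev, i):
        nonlocal state
        state += 1          # stands in for the cf_ar state update
        return prev + i     # stands in for the adaptive filter cascade

    start = 0
    if signal is not None:
        out.extend(signal[:warmup])   # teacher-forced frames go out verbatim
        for i in range(warmup):
            step(last, i)             # advance filter state only; output discarded
            last = signal[i]
        start = warmup

    for i in range(start, num_frames):  # free-running generation
        last = step(last, i)
        out.append(last)
    return out

print(run_frames(6, signal=[10, 11, 12, 13]))   # -> [10, 11, 12, 13, 17, 22]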
diff --git a/dnn/torch/osce/test_vocoder.py b/dnn/torch/osce/test_vocoder.py
index e71a5c37..55e5d00c 100644
--- a/dnn/torch/osce/test_vocoder.py
+++ b/dnn/torch/osce/test_vocoder.py
@@ -55,6 +55,7 @@ else:
     parser.add_argument('checkpoint', type=str, help='checkpoint file')
     parser.add_argument('output', type=str, help='output file')
     parser.add_argument('--debug', action='store_true', help='enables debug output')
+    parser.add_argument('--feature-version', type=int, help='feature version, default: 3', default=3)
 
     args = parser.parse_args()
@@ -85,7 +86,7 @@ model.load_state_dict(checkpoint['state_dict'])
 
 # generate model input
 setup = checkpoint['setup']
-testdata = load_lpcnet_features(input_folder)
+testdata = load_lpcnet_features(input_folder, version=args.feature_version)
 features = testdata['features']
 periods = testdata['periods']
diff --git a/dnn/torch/osce/train_vocoder.py b/dnn/torch/osce/train_vocoder.py
index 3572e962..ce6ef48c 100644
--- a/dnn/torch/osce/train_vocoder.py
+++ b/dnn/torch/osce/train_vocoder.py
@@ -126,16 +126,6 @@ if has_git:
 with open(os.path.join(args.output, setup_name), 'w') as f:
     yaml.dump(setup, f)
 
-ref = None
-# prepare inference test if wanted
-inference_test = False
-if type(args.test_features) != type(None):
-    test_features = load_lpcnet_features(args.test_features)
-    features = test_features['features']
-    periods = test_features['periods']
-    inference_folder = os.path.join(args.output, 'inference_test')
-    os.makedirs(inference_folder, exist_ok=True)
-    inference_test = True
 
 
 # training parameters
@@ -161,6 +151,18 @@ if 'validation_dataset' in setup:
 else:
     run_validation = False
 
+ref = None
+# prepare inference test if wanted
+inference_test = False
+if type(args.test_features) != type(None):
+    test_features = load_lpcnet_features(args.test_features, version=data.version)
+    features = test_features['features']
+    periods = test_features['periods']
+    inference_folder = os.path.join(args.output, 'inference_test')
+    os.makedirs(inference_folder, exist_ok=True)
+    inference_test = True
+
+
 # create model
 model = model_dict[model_name](*setup['model']['args'], **setup['model']['kwargs'])
diff --git a/dnn/torch/osce/utils/layers/td_shaper.py b/dnn/torch/osce/utils/layers/td_shaper.py
index 100a6cff..7bbfa514 100644
--- a/dnn/torch/osce/utils/layers/td_shaper.py
+++ b/dnn/torch/osce/utils/layers/td_shaper.py
@@ -12,7 +12,9 @@ class TDShaper(nn.Module):
                  frame_size=160,
                  avg_pool_k=4,
                  innovate=False,
-                 pool_after=False
+                 pool_after=False,
+                 kernel_size=2,
+                 tanh_activation=False,
                  ):
 
         """
@@ -36,25 +38,29 @@ class TDShaper(nn.Module):
 
         super().__init__()
 
-        self.feature_dim = feature_dim
-        self.frame_size = frame_size
-        self.avg_pool_k = avg_pool_k
-        self.innovate = innovate
-        self.pool_after = pool_after
+        self.feature_dim     = feature_dim
+        self.frame_size      = frame_size
+        self.avg_pool_k      = avg_pool_k
+        self.innovate        = innovate
+        self.pool_after      = pool_after
+        self.kernel_size     = kernel_size
+        self.tanh_activation = tanh_activation
 
         assert frame_size % avg_pool_k == 0
         self.env_dim = frame_size // avg_pool_k + 1
 
         # feature transform
-        self.feature_alpha1 = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, 2)
-        self.feature_alpha2 = nn.Conv1d(frame_size, frame_size, 2)
+        self.feature_alpha1 = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, kernel_size)
+        self.feature_alpha2 = nn.Conv1d(frame_size, frame_size, kernel_size)
 
         if self.innovate:
-            self.feature_alpha1b = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, 2)
-            self.feature_alpha1c = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, 2)
+            self.feature_alpha1b = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, kernel_size)
+            self.feature_alpha1c = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, kernel_size)
 
-            self.feature_alpha2b = nn.Conv1d(frame_size, frame_size, 2)
-            self.feature_alpha2c = nn.Conv1d(frame_size, frame_size, 2)
+            self.feature_alpha2b = nn.Conv1d(frame_size, frame_size, kernel_size)
+            self.feature_alpha2c = nn.Conv1d(frame_size, frame_size, kernel_size)
+
+        self.activation = torch.tanh if self.tanh_activation else torch.nn.LeakyReLU(0.2)
 
     def flop_count(self, rate):
@@ -105,6 +111,7 @@ class TDShaper(nn.Module):
         batch_size = x.size(0)
         num_frames = features.size(1)
         num_samples = x.size(2)
+        padding = 2 * self.kernel_size - 2
 
         # generate temporal envelope
         tenv = self.envelope_transform(x)
@@ -114,17 +121,17 @@ class TDShaper(nn.Module):
         if state is not None:
             f = torch.cat((state, f), dim=-1)
         else:
-            f = F.pad(f, [2, 0])
-        alpha = F.leaky_relu(self.feature_alpha1(f), 0.2)
+            f = F.pad(f, [padding, 0])
+        alpha = self.activation(self.feature_alpha1(f))
         alpha = torch.exp(self.feature_alpha2(alpha))
         alpha = alpha.permute(0, 2, 1)
 
         if self.innovate:
-            inno_alpha = F.leaky_relu(self.feature_alpha1b(f), 0.2)
+            inno_alpha = self.activation(self.feature_alpha1b(f))
             inno_alpha = torch.exp(self.feature_alpha2b(inno_alpha))
             inno_alpha = inno_alpha.permute(0, 2, 1)
 
-            inno_x = F.leaky_relu(self.feature_alpha1c(f), 0.2)
+            inno_x = self.activation(self.feature_alpha1c(f))
             inno_x = torch.tanh(self.feature_alpha2c(inno_x))
             inno_x = inno_x.permute(0, 2, 1)
 
@@ -138,7 +145,7 @@ class TDShaper(nn.Module):
         y = y.reshape(batch_size, 1, num_samples)
 
         if return_state:
-            new_state = f[..., -2:]
+            new_state = f[..., -padding:]
             return y, new_state
         else:
             return y
diff --git a/dnn/torch/osce/utils/lpcnet_features.py b/dnn/torch/osce/utils/lpcnet_features.py
index 3d109fd3..5125497e 100644
--- a/dnn/torch/osce/utils/lpcnet_features.py
+++ b/dnn/torch/osce/utils/lpcnet_features.py
@@ -4,7 +4,7 @@ import torch
 import numpy as np
 
 def load_lpcnet_features(feature_file, version=2):
-    if version == 2:
+    if version == 2 or version == 3:
         layout = {
             'cepstrum': [0,18],
             'periods': [18, 19],
@@ -37,7 +37,10 @@ def load_lpcnet_features(feature_file, version=2):
     )
 
     lpcs = raw_features[:, layout['lpc'][0] : layout['lpc'][1]]
-    periods = (0.1 + 50 * raw_features[:, layout['periods'][0] : layout['periods'][1]] + 100).long()
+    if version < 3:
+        periods = (0.1 + 50 * raw_features[:, layout['periods'][0] : layout['periods'][1]] + 100).long()
+    else:
+        periods = torch.round(torch.clip(256./2**(raw_features[:, layout['periods'][0] : layout['periods'][1]] + 1.5), 32, 256)).long()
 
     return {'features' : features, 'periods' : periods, 'lpcs' : lpcs}
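One detail worth noting in the `td_shaper.py` change: with the kernel size now configurable, the causal left-padding becomes `padding = 2 * self.kernel_size - 2`, because the two stacked `Conv1d` layers (`feature_alpha1`, `feature_alpha2`) each consume `kernel_size - 1` past frames, and `return_state` accordingly carries the last `padding` frames instead of the last 2. A small shape check under assumed toy dimensions (not the layer's real channel counts):

import torch
import torch.nn as nn
import torch.nn.functional as F

k, dim, frames = 4, 8, 10       # toy sizes; the models above use kernel_size=4
conv1 = nn.Conv1d(dim, dim, k)
conv2 = nn.Conv1d(dim, dim, k)
padding = 2 * k - 2             # 6: (k-1) past frames per conv, stacked twice

f = torch.randn(1, dim, frames)
fp = F.pad(f, [padding, 0])     # causal: pad on the left only
y = conv2(torch.tanh(conv1(fp)))
print(y.shape)                  # torch.Size([1, 8, 10]), one output per frame
new_state = fp[..., -padding:]  # what return_state carries into the next call
print(new_state.shape)          # torch.Size([1, 8, 6])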