
gitlab.xiph.org/xiph/opus.git
author    Jan Buethe <jbuethe@amazon.de>  2023-10-24 17:49:28 +0300
committer Jan Buethe <jbuethe@amazon.de>  2023-10-24 17:49:28 +0300
commit    b0a525109a71cda3fdd0ddd211222ffb21683537 (patch)
tree      bddcf7207d1c3c52aaa60e96c1eb17337a73ac67
parent    8f89dddd38375311e1d815462b0803f42630d249 (diff)

updated to new neural pitch dataset (opus-ng-2bc)
-rw-r--r--  dnn/torch/osce/README.md                        |   3
-rw-r--r--  dnn/torch/osce/adv_train_vocoder.py             |  22
-rw-r--r--  dnn/torch/osce/data/lpcnet_vocoding_dataset.py  |   7
-rw-r--r--  dnn/torch/osce/models/__init__.py               |   4
-rw-r--r--  dnn/torch/osce/models/lavoce_400_ar.py          |  16
-rw-r--r--  dnn/torch/osce/models/lavoce_400_ar2.py         | 295
-rw-r--r--  dnn/torch/osce/test_vocoder.py                  |   3
-rw-r--r--  dnn/torch/osce/train_vocoder.py                 |  22
-rw-r--r--  dnn/torch/osce/utils/layers/td_shaper.py        |  41
-rw-r--r--  dnn/torch/osce/utils/lpcnet_features.py         |   7
10 files changed, 373 insertions(+), 47 deletions(-)
diff --git a/dnn/torch/osce/README.md b/dnn/torch/osce/README.md
index b1475d91..c7e0806e 100644
--- a/dnn/torch/osce/README.md
+++ b/dnn/torch/osce/README.md
@@ -12,3 +12,6 @@ The code is tested with python 3.11. Conda setup is done via
`conda activate osce`
`python -m pip install -r requirements.txt`
+
+
+## Training Data
diff --git a/dnn/torch/osce/adv_train_vocoder.py b/dnn/torch/osce/adv_train_vocoder.py
index 3728bc2f..dc24b843 100644
--- a/dnn/torch/osce/adv_train_vocoder.py
+++ b/dnn/torch/osce/adv_train_vocoder.py
@@ -131,17 +131,6 @@ with open(os.path.join(args.output, setup_name), 'w') as f:
yaml.dump(setup, f)
-ref = None
-# prepare inference test if wanted
-inference_test = False
-if type(args.test_features) != type(None):
- test_features = load_lpcnet_features(args.test_features)
- features = test_features['features']
- periods = test_features['periods']
- inference_folder = os.path.join(args.output, 'inference_test')
- os.makedirs(inference_folder, exist_ok=True)
- inference_test = True
-
# training parameters
batch_size = setup['training']['batch_size']
@@ -170,6 +159,17 @@ if 'validation_dataset' in setup:
else:
run_validation = False
+ref = None
+# prepare inference test if wanted
+inference_test = False
+if type(args.test_features) != type(None):
+ test_features = load_lpcnet_features(args.test_features, version=data.version)
+ features = test_features['features']
+ periods = test_features['periods']
+ inference_folder = os.path.join(args.output, 'inference_test')
+ os.makedirs(inference_folder, exist_ok=True)
+ inference_test = True
+
# create model
model = model_dict[model_name](*setup['model']['args'], **setup['model']['kwargs'])
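(The two hunks above do not change the inference-test logic; they relocate it below the point where the training dataset is constructed, so that the dataset's feature version, data.version, can be passed to load_lpcnet_features. The same relocation is applied to train_vocoder.py further down.)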
diff --git a/dnn/torch/osce/data/lpcnet_vocoding_dataset.py b/dnn/torch/osce/data/lpcnet_vocoding_dataset.py
index 36c8c724..d9b5c6b8 100644
--- a/dnn/torch/osce/data/lpcnet_vocoding_dataset.py
+++ b/dnn/torch/osce/data/lpcnet_vocoding_dataset.py
@@ -86,6 +86,8 @@ class LPCNetVocodingDataset(Dataset):
self.getitem = self.getitem_v1
elif self.version == 2:
self.getitem = self.getitem_v2
+ elif self.version == 3:
+ self.getitem = self.getitem_v2
else:
raise ValueError(f"dataset version {self.version} unknown")
@@ -138,7 +140,10 @@ class LPCNetVocodingDataset(Dataset):
# convert periods
if 'periods' in self.input_features:
- sample['periods'] = (0.1 + 50 * sample['periods'] + 100).astype('int16')
+ if self.version < 3:
+ sample['periods'] = (0.1 + 50 * sample['periods'] + 100).astype('int16')
+ else:
+ sample['periods'] = np.round(np.clip(256./2**(sample['periods']+1.5), 32, 256)).astype('int')
signal_start = (self.frame_offset + index * self.frames_per_sample) * self.frame_length
signal_stop = (self.frame_offset + (index + 1) * self.frames_per_sample) * self.frame_length
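The version-3 features encode pitch in a log domain rather than as an affine-scaled period, so the branch above inverts that mapping and clips the result to the range used by the pitch embedding. A self-contained numpy sketch of the same decoding (the function name decode_periods_v3 is illustrative, not part of the patch):

    import numpy as np

    def decode_periods_v3(raw):
        # raw is the float value stored in the 'periods' feature column (version 3)
        return np.round(np.clip(256.0 / 2.0 ** (raw + 1.5), 32, 256)).astype('int')

    # raw = 0.0 -> 256 / 2**1.5 ~= 90.5 -> period of 91 samples (~176 Hz at 16 kHz);
    # raw = -1.5 and 1.5 hit the clip bounds 256 and 32 respectively
    print(decode_periods_v3(np.array([0.0, -1.5, 1.5])))   # [ 91 256  32]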
diff --git a/dnn/torch/osce/models/__init__.py b/dnn/torch/osce/models/__init__.py
index e6bbbc36..eb491464 100644
--- a/dnn/torch/osce/models/__init__.py
+++ b/dnn/torch/osce/models/__init__.py
@@ -33,6 +33,7 @@ from .lavoce import LaVoce
from .lavoce_cont import LaVoceCont
from .lavoce_400 import LaVoce400
from .lavoce_400_ar import LaVoce400AR
+from .lavoce_400_ar2 import LaVoce400AR2
from .fd_discriminator import TFDMultiResolutionDiscriminator as FDMResDisc
model_dict = {
@@ -42,5 +43,6 @@ model_dict = {
'lavocecont': LaVoceCont,
'lavoce400': LaVoce400,
'fdmresdisc': FDMResDisc,
- 'lavoce400ar': LaVoce400AR
+ 'lavoce400ar': LaVoce400AR,
+ 'lavoce400ar2': LaVoce400AR2
}
diff --git a/dnn/torch/osce/models/lavoce_400_ar.py b/dnn/torch/osce/models/lavoce_400_ar.py
index 4bbb6a17..9955de4d 100644
--- a/dnn/torch/osce/models/lavoce_400_ar.py
+++ b/dnn/torch/osce/models/lavoce_400_ar.py
@@ -103,9 +103,9 @@ class LaVoce400AR(nn.Module):
self.af1 = LimitedAdaptiveConv1d(1, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
# non-linear transforms
- self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True)
- self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
- self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
+ self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True, kernel_size=4)
+ self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=4)
+ self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=4)
# combinators
self.af2 = LimitedAdaptiveConv1d(2, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
@@ -226,7 +226,15 @@ class LaVoce400AR(nn.Module):
frames = []
- for i in range(num_frames):
+ # pre-load buffer
+ if signal is not None:
+ frames = [signal[:, :, :4*self.FRAME_SIZE]]
+ with torch.no_grad():
+ for i in range(4):
+ y, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
+ last_frame = signal[:, :, i * self.FRAME_SIZE : (i + 1) * self.FRAME_SIZE]
+
+ for i in range(4 if nb_pre_frames > 0 else 0, num_frames):
y, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
y = torch.cat((y, prior[..., i * self.FRAME_SIZE : (i+1) * self.FRAME_SIZE]), dim=1)
y, state_af_mix = self.af_mix(y, cf[:, i:i+1], state=state_af_mix, return_state=True)
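The new pre-load branch warms up the autoregressive state on ground-truth audio: when a signal is passed to forward(), its first four frames are emitted as-is, the AR comb filter state is advanced over them under torch.no_grad(), and free-running generation starts at frame 4. The same logic appears in the new LaVoce400AR2 (next file); a shape-level sketch using that model, run from dnn/torch/osce, with all tensor sizes being illustrative assumptions:

    import torch
    from models import model_dict

    model = model_dict['lavoce400ar2']()               # default constructor arguments
    features = torch.randn(1, 8, 20)                   # (batch, feature frames, num_features)
    periods = torch.randint(32, 257, (1, 8, 1))        # pitch period per feature frame, in samples
    warmup = torch.randn(1, 1, 4 * model.FRAME_SIZE)   # 4 ground-truth frames for the AR buffer

    y = model(features, periods, signal=warmup)        # frames 0..3 of y are copied from warmup
    print(y.shape)                                     # (1, 1, 32 * model.FRAME_SIZE)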
diff --git a/dnn/torch/osce/models/lavoce_400_ar2.py b/dnn/torch/osce/models/lavoce_400_ar2.py
new file mode 100644
index 00000000..73487831
--- /dev/null
+++ b/dnn/torch/osce/models/lavoce_400_ar2.py
@@ -0,0 +1,295 @@
+"""
+/* Copyright (c) 2023 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+import numpy as np
+
+from utils.layers.limited_adaptive_comb1d import LimitedAdaptiveComb1d
+from utils.layers.limited_adaptive_conv1d import LimitedAdaptiveConv1d
+from utils.layers.ar_filter import ARFilter
+from utils.layers.td_shaper import TDShaper
+from utils.layers.noise_shaper import NoiseShaper
+from utils.complexity import _conv1d_flop_count
+from utils.endoscopy import write_data
+
+from models.nns_base import NNSBase
+from models.lpcnet_feature_net import LPCNetFeatureNet
+from .scale_embedding import ScaleEmbedding
+
+class LaVoce400AR2(nn.Module):
+ """ Linear-Adaptive VOCodEr """
+ FEATURE_FRAME_SIZE=160
+ FRAME_SIZE=40
+
+ def __init__(self,
+ num_features=20,
+ pitch_embedding_dim=64,
+ cond_dim=256,
+ pitch_max=300,
+ kernel_size=15,
+ preemph=0.85,
+ comb_gain_limit_db=-6,
+ global_gain_limits_db=[-6, 6],
+ conv_gain_limits_db=[-6, 6],
+ norm_p=2,
+ avg_pool_k=4,
+ pulses=False,
+ tdshape_kernel_size=4):
+
+ super().__init__()
+
+
+ self.num_features = num_features
+ self.cond_dim = cond_dim
+ self.pitch_max = pitch_max
+ self.pitch_embedding_dim = pitch_embedding_dim
+ self.kernel_size = kernel_size
+ self.preemph = preemph
+ self.pulses = pulses
+
+ assert self.FEATURE_FRAME_SIZE % self.FRAME_SIZE == 0
+ self.upsamp_factor = self.FEATURE_FRAME_SIZE // self.FRAME_SIZE
+
+ # pitch embedding
+ self.pitch_embedding = nn.Embedding(pitch_max + 1, pitch_embedding_dim)
+
+ # feature net
+ self.feature_net = LPCNetFeatureNet(num_features + pitch_embedding_dim, cond_dim, self.upsamp_factor)
+
+ # noise shaper
+ self.noise_shaper = NoiseShaper(cond_dim, self.FRAME_SIZE)
+
+ # comb filters
+ left_pad = self.kernel_size // 2
+ right_pad = self.kernel_size - 1 - left_pad
+ self.cf1 = LimitedAdaptiveComb1d(self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, use_bias=False, padding=[left_pad, right_pad], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, global_gain_limits_db=global_gain_limits_db, norm_p=norm_p)
+ self.cf2 = LimitedAdaptiveComb1d(self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, use_bias=False, padding=[left_pad, right_pad], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, global_gain_limits_db=global_gain_limits_db, norm_p=norm_p)
+
+ self.cf_ar = ARFilter(5, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, padding=[2, 2], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, norm_p=norm_p)
+
+ self.af_prescale = LimitedAdaptiveConv1d(2, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+ self.af_mix = LimitedAdaptiveConv1d(3, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+ # spectral shaping
+ self.af1 = LimitedAdaptiveConv1d(1, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+ # non-linear transforms
+ self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True, kernel_size=tdshape_kernel_size, tanh_activation=True)
+ self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=tdshape_kernel_size, tanh_activation=True)
+ self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, kernel_size=tdshape_kernel_size, tanh_activation=True)
+
+ # combinators
+ self.af2 = LimitedAdaptiveConv1d(3, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+ self.af3 = LimitedAdaptiveConv1d(3, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+ self.af4 = LimitedAdaptiveConv1d(3, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+ # feature transforms
+ self.post_cf1 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_cf2 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_af1 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_af2 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_af3 = nn.Conv1d(cond_dim, cond_dim, 2)
+
+
+ def create_phase_signals(self, periods):
+
+ batch_size = periods.size(0)
+ progression = torch.arange(1, self.FRAME_SIZE + 1, dtype=periods.dtype, device=periods.device).view((1, -1))
+ progression = torch.repeat_interleave(progression, batch_size, 0)
+
+ phase0 = torch.zeros(batch_size, dtype=periods.dtype, device=periods.device).unsqueeze(-1)
+ chunks = []
+ for sframe in range(periods.size(1)):
+ f = (2.0 * torch.pi / periods[:, sframe]).unsqueeze(-1)
+
+ if self.pulses:
+ alpha = torch.cos(f).view(batch_size, 1, 1)
+ chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+ pulse_a = torch.relu(chunk_sin - alpha) / (1 - alpha)
+ pulse_b = torch.relu(-chunk_sin - alpha) / (1 - alpha)
+
+ chunk = torch.cat((pulse_a, pulse_b), dim = 1)
+ else:
+ chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+ chunk_cos = torch.cos(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+
+ chunk = torch.cat((chunk_sin, chunk_cos), dim = 1)
+
+ phase0 = phase0 + self.FRAME_SIZE * f
+
+ chunks.append(chunk)
+
+ phase_signals = torch.cat(chunks, dim=-1)
+
+ return phase_signals
+
+ def flop_count(self, rate=16000, verbose=False):
+
+ frame_rate = rate / self.FRAME_SIZE
+
+ # feature net
+ feature_net_flops = self.feature_net.flop_count(frame_rate)
+ comb_flops = self.cf1.flop_count(rate) + self.cf2.flop_count(rate) + self.cf_ar.flop_count(rate)
+ af_flops = self.af1.flop_count(rate) + self.af2.flop_count(rate) + self.af3.flop_count(rate) + self.af4.flop_count(rate) + self.af_mix.flop_count(rate) + self.af_prescale.flop_count(rate)
+ feature_flops = (_conv1d_flop_count(self.post_cf1, frame_rate) + _conv1d_flop_count(self.post_cf2, frame_rate)
+ + _conv1d_flop_count(self.post_af1, frame_rate) + _conv1d_flop_count(self.post_af2, frame_rate) + _conv1d_flop_count(self.post_af3, frame_rate))
+
+ if verbose:
+ print(f"feature net: {feature_net_flops / 1e6} MFLOPS")
+ print(f"comb filters: {comb_flops / 1e6} MFLOPS")
+ print(f"adaptive conv: {af_flops / 1e6} MFLOPS")
+ print(f"feature transforms: {feature_flops / 1e6} MFLOPS")
+
+ return feature_net_flops + comb_flops + af_flops + feature_flops
+
+ def feature_transform(self, f, layer):
+ f = f.permute(0, 2, 1)
+ f = F.pad(f, [1, 0])
+ f = torch.tanh(layer(f))
+ return f.permute(0, 2, 1)
+
+ def forward(self, features, periods, signal=None, debug=False):
+
+ periods = periods.squeeze(-1)
+ pitch_embedding = self.pitch_embedding(periods)
+
+ if signal is not None:
+ nb_pre_frames = signal.size(-1) // self.FRAME_SIZE
+ if len(signal.shape) < 3:
+ signal = signal.unsqueeze(1)
+ else:
+ nb_pre_frames = 0
+
+ full_features = torch.cat((features, pitch_embedding), dim=-1)
+ cf = self.feature_net(full_features)
+ cf1 = self.feature_transform(cf, self.post_af2)
+ cf2 = self.feature_transform(cf1, self.post_af3)
+ cf3 = self.feature_transform(cf2, self.post_cf1)
+ cf4 = self.feature_transform(cf3, self.post_cf2)
+ cf5 = self.feature_transform(cf4, self.post_af1)
+
+
+ # upsample periods
+ periods = torch.repeat_interleave(periods, self.upsamp_factor, 1)
+ periods_ar = torch.where(periods > 42, periods, 2*periods)
+
+ num_frames = periods.size(1)
+
+ # pre-net
+ ref_phase = torch.tanh(self.create_phase_signals(periods))
+ x = self.af_prescale(ref_phase, cf)
+ noise = self.noise_shaper(cf)
+ prior = torch.cat((x, noise), dim=1)
+
+ # states
+ state_cf_ar = None
+ state_af_mix = None
+ state_tdshape1 = None
+ state_tdshape2 = None
+ state_cf1 = None
+ state_cf2 = None
+ state_af1 = None
+ state_af2 = None
+ state_af3 = None
+ state_tdshape3 = None
+ state_af4 = None
+ last_frame = torch.zeros((features.size(0), 1, self.FRAME_SIZE), device=features.device)
+
+ frames = []
+
+ # pre-load buffer
+ if signal is not None:
+ frames = [signal[:, :, :4*self.FRAME_SIZE]]
+ with torch.no_grad():
+ for i in range(4):
+ y, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
+ last_frame = signal[:, :, i * self.FRAME_SIZE : (i + 1) * self.FRAME_SIZE]
+
+
+ for i in range(4 if nb_pre_frames > 0 else 0, num_frames):
+ pred, state_cf_ar = self.cf_ar(last_frame, cf[:, i:i+1], periods_ar[:, i:i+1], state=state_cf_ar, return_state=True)
+ y = torch.cat((pred, prior[..., i * self.FRAME_SIZE : (i+1) * self.FRAME_SIZE]), dim=1)
+ y, state_af_mix = self.af_mix(y, cf[:, i:i+1], state=state_af_mix, return_state=True)
+
+ # temporal shaping + innovating
+ y1 = y[:, 0:1, :]
+ y2, state_tdshape1 = self.tdshape1(y[:, 1:2, :], cf[:, i:i+1], state=state_tdshape1, return_state=True)
+ y = torch.cat((y1, y2, pred), dim=1)
+ y, state_af2 = self.af2(y, cf[:, i:i+1], state=state_af2, return_state=True, debug=debug)
+
+ # second temporal shaping
+ y1 = y[:, 0:1, :]
+ y2, state_tdshape2 = self.tdshape2(y[:, 1:2, :], cf1[:, i:i+1], state=state_tdshape2, return_state=True)
+ y = torch.cat((y1, y2, pred), dim=1)
+ y, state_af3 = self.af3(y, cf1[:, i:i+1], state=state_af3, return_state=True, debug=debug)
+
+ # spectral shaping
+ y, state_cf1 = self.cf1(y, cf2[:, i:i+1], periods[:, i:i+1], state=state_cf1, return_state=True, debug=debug)
+ y, state_cf2 = self.cf2(y, cf3[:, i:i+1], periods[:, i:i+1], state=state_cf2, return_state=True, debug=debug)
+ y, state_af1 = self.af1(y, cf4[:, i:i+1], state=state_af1, return_state=True, debug=debug)
+
+ # final temporal env adjustment
+ y1 = y[:, 0:1, :]
+ y2, state_tdshape3 = self.tdshape3(y[:, 1:2, :], cf5[:, i:i+1], state=state_tdshape3, return_state=True)
+ y = torch.cat((y1, y2, pred), dim=1)
+ y, state_af4 = self.af4(y, cf5[:, i:i+1], state=state_af4, return_state=True, debug=debug)
+
+ if i < nb_pre_frames:
+ y = signal[:, :, i * self.FRAME_SIZE : (i + 1) * self.FRAME_SIZE]
+
+ last_frame = y
+ frames.append(y)
+
+ return torch.cat(frames, dim=-1)
+
+ def process(self, features, periods, debug=False):
+
+ self.eval()
+ device = next(iter(self.parameters())).device
+ with torch.no_grad():
+
+ # run model
+ f = features.unsqueeze(0).to(device)
+ p = periods.unsqueeze(0).to(device)
+
+ y = self.forward(f, p, debug=debug).squeeze()
+
+ # deemphasis
+ if self.preemph > 0:
+ for i in range(len(y) - 1):
+ y[i + 1] += self.preemph * y[i]
+
+ # clip to valid range
+ out = torch.clip((2**15) * y, -2**15, 2**15 - 1).short()
+
+ return out
\ No newline at end of file
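The new file stays close to lavoce_400_ar.py; the main differences are that the AR prediction (pred) is fed as an extra input channel into af2, af3 and af4 (three input channels each) and that the TDShaper layers use the new kernel_size and tanh activation options. A minimal inference sketch, assuming default constructor arguments, a version-3 feature dump and the dnn/torch/osce working directory (all paths are placeholders):

    import torch
    from models import model_dict
    from utils.lpcnet_features import load_lpcnet_features

    # placeholder path; expects an LPCNet feature dump in the version-3 layout
    data = load_lpcnet_features('path/to/features', version=3)

    model = model_dict['lavoce400ar2']()               # default constructor arguments
    # optionally restore trained weights, as test_vocoder.py does:
    # model.load_state_dict(torch.load('checkpoint.pth', map_location='cpu')['state_dict'])

    pcm = model.process(data['features'], data['periods'])   # int16 output samples
    print(pcm.shape, pcm.dtype)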
diff --git a/dnn/torch/osce/test_vocoder.py b/dnn/torch/osce/test_vocoder.py
index e71a5c37..55e5d00c 100644
--- a/dnn/torch/osce/test_vocoder.py
+++ b/dnn/torch/osce/test_vocoder.py
@@ -55,6 +55,7 @@ else:
parser.add_argument('checkpoint', type=str, help='checkpoint file')
parser.add_argument('output', type=str, help='output file')
parser.add_argument('--debug', action='store_true', help='enables debug output')
+ parser.add_argument('--feature-version', type=int, help='feature version, default: 3', default=3)
args = parser.parse_args()
@@ -85,7 +86,7 @@ model.load_state_dict(checkpoint['state_dict'])
# generate model input
setup = checkpoint['setup']
-testdata = load_lpcnet_features(input_folder)
+testdata = load_lpcnet_features(input_folder, version=args.feature_version)
features = testdata['features']
periods = testdata['periods']
diff --git a/dnn/torch/osce/train_vocoder.py b/dnn/torch/osce/train_vocoder.py
index 3572e962..ce6ef48c 100644
--- a/dnn/torch/osce/train_vocoder.py
+++ b/dnn/torch/osce/train_vocoder.py
@@ -126,16 +126,6 @@ if has_git:
with open(os.path.join(args.output, setup_name), 'w') as f:
yaml.dump(setup, f)
-ref = None
-# prepare inference test if wanted
-inference_test = False
-if type(args.test_features) != type(None):
- test_features = load_lpcnet_features(args.test_features)
- features = test_features['features']
- periods = test_features['periods']
- inference_folder = os.path.join(args.output, 'inference_test')
- os.makedirs(inference_folder, exist_ok=True)
- inference_test = True
# training parameters
@@ -161,6 +151,18 @@ if 'validation_dataset' in setup:
else:
run_validation = False
+ref = None
+# prepare inference test if wanted
+inference_test = False
+if type(args.test_features) != type(None):
+ test_features = load_lpcnet_features(args.test_features, version=data.version)
+ features = test_features['features']
+ periods = test_features['periods']
+ inference_folder = os.path.join(args.output, 'inference_test')
+ os.makedirs(inference_folder, exist_ok=True)
+ inference_test = True
+
+
# create model
model = model_dict[model_name](*setup['model']['args'], **setup['model']['kwargs'])
diff --git a/dnn/torch/osce/utils/layers/td_shaper.py b/dnn/torch/osce/utils/layers/td_shaper.py
index 100a6cff..7bbfa514 100644
--- a/dnn/torch/osce/utils/layers/td_shaper.py
+++ b/dnn/torch/osce/utils/layers/td_shaper.py
@@ -12,7 +12,9 @@ class TDShaper(nn.Module):
frame_size=160,
avg_pool_k=4,
innovate=False,
- pool_after=False
+ pool_after=False,
+ kernel_size=2,
+ tanh_activation=False,
):
"""
@@ -36,25 +38,29 @@ class TDShaper(nn.Module):
super().__init__()
- self.feature_dim = feature_dim
- self.frame_size = frame_size
- self.avg_pool_k = avg_pool_k
- self.innovate = innovate
- self.pool_after = pool_after
+ self.feature_dim = feature_dim
+ self.frame_size = frame_size
+ self.avg_pool_k = avg_pool_k
+ self.innovate = innovate
+ self.pool_after = pool_after
+ self.kernel_size = kernel_size
+ self.tanh_activation = tanh_activation
assert frame_size % avg_pool_k == 0
self.env_dim = frame_size // avg_pool_k + 1
# feature transform
- self.feature_alpha1 = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, 2)
- self.feature_alpha2 = nn.Conv1d(frame_size, frame_size, 2)
+ self.feature_alpha1 = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, kernel_size)
+ self.feature_alpha2 = nn.Conv1d(frame_size, frame_size, kernel_size)
if self.innovate:
- self.feature_alpha1b = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, 2)
- self.feature_alpha1c = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, 2)
+ self.feature_alpha1b = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, kernel_size)
+ self.feature_alpha1c = nn.Conv1d(self.feature_dim + self.env_dim, frame_size, kernel_size)
- self.feature_alpha2b = nn.Conv1d(frame_size, frame_size, 2)
- self.feature_alpha2c = nn.Conv1d(frame_size, frame_size, 2)
+ self.feature_alpha2b = nn.Conv1d(frame_size, frame_size, kernel_size)
+ self.feature_alpha2c = nn.Conv1d(frame_size, frame_size, kernel_size)
+
+ self.activation = torch.tanh if self.tanh_activation else torch.nn.LeakyReLU(0.2)
def flop_count(self, rate):
@@ -105,6 +111,7 @@ class TDShaper(nn.Module):
batch_size = x.size(0)
num_frames = features.size(1)
num_samples = x.size(2)
+ padding = 2 * self.kernel_size - 2
# generate temporal envelope
tenv = self.envelope_transform(x)
@@ -114,17 +121,17 @@ class TDShaper(nn.Module):
if state is not None:
f = torch.cat((state, f), dim=-1)
else:
- f = F.pad(f, [2, 0])
- alpha = F.leaky_relu(self.feature_alpha1(f), 0.2)
+ f = F.pad(f, [padding, 0])
+ alpha = self.activation(self.feature_alpha1(f))
alpha = torch.exp(self.feature_alpha2(alpha))
alpha = alpha.permute(0, 2, 1)
if self.innovate:
- inno_alpha = F.leaky_relu(self.feature_alpha1b(f), 0.2)
+ inno_alpha = self.activation(self.feature_alpha1b(f))
inno_alpha = torch.exp(self.feature_alpha2b(inno_alpha))
inno_alpha = inno_alpha.permute(0, 2, 1)
- inno_x = F.leaky_relu(self.feature_alpha1c(f), 0.2)
+ inno_x = self.activation(self.feature_alpha1c(f))
inno_x = torch.tanh(self.feature_alpha2c(inno_x))
inno_x = inno_x.permute(0, 2, 1)
@@ -138,7 +145,7 @@ class TDShaper(nn.Module):
y = y.reshape(batch_size, 1, num_samples)
if return_state:
- new_state = f[..., -2:]
+ new_state = f[..., -padding:]
return y, new_state
else:
return y
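With these changes the two feature convolutions share a configurable kernel_size, so the left padding and the carried streaming state grow from 2 to 2*(kernel_size-1) feature columns, and the first-layer activation can be switched from leaky ReLU to tanh. A small standalone sketch with the settings the vocoder models now use (shapes are assumptions for illustration):

    import torch
    from utils.layers.td_shaper import TDShaper

    # frame_size=40 matches the LaVoce models; kernel_size=4 gives padding 2*4 - 2 = 6
    shaper = TDShaper(256, frame_size=40, avg_pool_k=4, kernel_size=4, tanh_activation=True)

    x = torch.randn(1, 1, 40)        # one 40-sample frame to be shaped
    cond = torch.randn(1, 1, 256)    # one conditioning frame (feature_dim=256)
    y, state = shaper(x, cond, state=None, return_state=True)
    # y: (1, 1, 40); state keeps the last 6 conditioning/envelope columns for streaming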
diff --git a/dnn/torch/osce/utils/lpcnet_features.py b/dnn/torch/osce/utils/lpcnet_features.py
index 3d109fd3..5125497e 100644
--- a/dnn/torch/osce/utils/lpcnet_features.py
+++ b/dnn/torch/osce/utils/lpcnet_features.py
@@ -4,7 +4,7 @@ import torch
import numpy as np
def load_lpcnet_features(feature_file, version=2):
- if version == 2:
+ if version == 2 or version == 3:
layout = {
'cepstrum': [0,18],
'periods': [18, 19],
@@ -37,7 +37,10 @@ def load_lpcnet_features(feature_file, version=2):
)
lpcs = raw_features[:, layout['lpc'][0] : layout['lpc'][1]]
- periods = (0.1 + 50 * raw_features[:, layout['periods'][0] : layout['periods'][1]] + 100).long()
+ if version < 3:
+ periods = (0.1 + 50 * raw_features[:, layout['periods'][0] : layout['periods'][1]] + 100).long()
+ else:
+ periods = torch.round(torch.clip(256./2**(raw_features[:, layout['periods'][0] : layout['periods'][1]] + 1.5), 32, 256)).long()
return {'features' : features, 'periods' : periods, 'lpcs' : lpcs}
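The loader-side counterpart of the dataset change above. Both period decodings side by side, in torch, with values chosen to hit the version-3 clip bounds:

    import torch

    raw = torch.tensor([[-1.5], [0.0], [1.5]])

    # versions < 3: period stored as an affine-scaled value
    periods_v2 = (0.1 + 50 * raw + 100).long()                                     # 25, 100, 175

    # version 3: period stored in the log domain, decoded to [32, 256] samples
    periods_v3 = torch.round(torch.clip(256. / 2 ** (raw + 1.5), 32, 256)).long()  # 256, 91, 32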