gitlab.xiph.org/xiph/opus.git
commit 0a92bc5eaa6467d63efbed0b5ff625db64be5629
tree   58c3436a38a93255e5c0387d954daa0d954b526f
parent 52c15629eef8e1d913ce67c1b46f27301854b05d
author    Jan Buethe <jbuethe@amazon.de>  2023-09-21 16:01:11 +0300
committer Jan Buethe <jbuethe@amazon.de>  2023-09-21 16:01:11 +0300

    more lavoce stuff

 dnn/torch/osce/models/__init__.py   |   2 ++
 dnn/torch/osce/models/lavoce.py     |  53 ++++++----
 dnn/torch/osce/models/lavoce_400.py | 254 ++++++++++++++++++++
 3 files changed, 292 insertions(+), 17 deletions(-)
diff --git a/dnn/torch/osce/models/__init__.py b/dnn/torch/osce/models/__init__.py
index c7857349..859db033 100644
--- a/dnn/torch/osce/models/__init__.py
+++ b/dnn/torch/osce/models/__init__.py
@@ -30,11 +30,13 @@
 from .lace import LACE
 from .no_lace import NoLACE
 from .lavoce import LaVoce
+from .lavoce_400 import LaVoce400
 from .fd_discriminator import TFDMultiResolutionDiscriminator as FDMResDisc
 
 model_dict = {
     'lace': LACE,
     'nolace': NoLACE,
     'lavoce': LaVoce,
+    'lavoce400': LaVoce400,
     'fdmresdisc': FDMResDisc,
 }
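
For reference, a minimal sketch of how this registry is typically consumed; the setup dict and its keys are illustrative assumptions, not part of this commit:

    # hypothetical setup lookup; only model_dict itself comes from this diff
    # (assumes the working directory is dnn/torch/osce)
    from models import model_dict

    setup = {'model': {'name': 'lavoce400', 'kwargs': {'cond_dim': 256}}}
    model = model_dict[setup['model']['name']](**setup['model']['kwargs'])
    print(type(model).__name__)  # LaVoce400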
diff --git a/dnn/torch/osce/models/lavoce.py b/dnn/torch/osce/models/lavoce.py
index 1a9dc871..795246b9 100644
--- a/dnn/torch/osce/models/lavoce.py
+++ b/dnn/torch/osce/models/lavoce.py
@@ -45,6 +45,17 @@ from models.nns_base import NNSBase
 from models.lpcnet_feature_net import LPCNetFeatureNet
 from .scale_embedding import ScaleEmbedding
 
+def print_channels(y, prefix="", name="", rate=16000):
+    num_channels = y.size(1)
+    for i in range(num_channels):
+        channel_name = f"{prefix}_c{i:02d}"
+        if len(name) > 0: channel_name += "_" + name
+        ch = y[0,i,:].detach().cpu().numpy()
+        ch = ((2**14) * ch / np.max(ch)).astype(np.int16)
+        write_data(channel_name, ch, rate)
+
+
+
 class LaVoce(nn.Module):
     """ Linear-Adaptive VOCodEr """
     FEATURE_FRAME_SIZE=160
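
A usage sketch for the new helper, assuming utils.endoscopy's write_data has been set up to receive the dumps. Note that it normalizes by np.max(ch), the signed maximum rather than np.max(np.abs(ch)), so a channel whose negative peak dominates can overflow the int16 cast despite the 2**14 headroom:

    # hypothetical call; y is a (batch, channels, samples) tensor as in forward()
    import torch
    y = torch.randn(1, 2, 640)
    print_channels(y, prefix="lavoce_00", name="demo", rate=16000)
    # writes one int16 stream per channel: lavoce_00_c00_demo, lavoce_00_c01_demo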
@@ -62,7 +73,11 @@ class LaVoce(nn.Module):
                  conv_gain_limits_db=[-6, 6],
                  norm_p=2,
                  avg_pool_k=4,
-                 pulses=False):
+                 pulses=False,
+                 innovate1=True,
+                 innovate2=False,
+                 innovate3=False,
+                 ftrans_k=2):
 
         super().__init__()
 
@@ -101,9 +116,9 @@
         self.af1 = LimitedAdaptiveConv1d(1, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
 
         # non-linear transforms
-        self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True)
-        self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
-        self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
+        self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate1)
+        self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate2)
+        self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=innovate3)
 
         # combinators
         self.af2 = LimitedAdaptiveConv1d(2, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
@@ -111,11 +126,11 @@
         self.af4 = LimitedAdaptiveConv1d(2, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
 
         # feature transforms
-        self.post_cf1 = nn.Conv1d(cond_dim, cond_dim, 2)
-        self.post_cf2 = nn.Conv1d(cond_dim, cond_dim, 2)
-        self.post_af1 = nn.Conv1d(cond_dim, cond_dim, 2)
-        self.post_af2 = nn.Conv1d(cond_dim, cond_dim, 2)
-        self.post_af3 = nn.Conv1d(cond_dim, cond_dim, 2)
+        self.post_cf1 = nn.Conv1d(cond_dim, cond_dim, ftrans_k)
+        self.post_cf2 = nn.Conv1d(cond_dim, cond_dim, ftrans_k)
+        self.post_af1 = nn.Conv1d(cond_dim, cond_dim, ftrans_k)
+        self.post_af2 = nn.Conv1d(cond_dim, cond_dim, ftrans_k)
+        self.post_af3 = nn.Conv1d(cond_dim, cond_dim, ftrans_k)
 
 
     def create_phase_signals(self, periods, pulses=False):
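
The new ftrans_k parameter only changes the kernel size of the post_* feature transforms; the left padding inside feature_transform stays fixed at one frame (F.pad(f, [1, 0]), as in the copy of the helper in lavoce_400.py below), so the conditioning sequence keeps its length only for the default ftrans_k=2. A length check under that assumption:

    # output frames = T + 1 - (k - 1): only k == 2 preserves T
    import torch
    from torch import nn
    import torch.nn.functional as F

    T, cond_dim = 10, 256
    f = torch.randn(1, cond_dim, T)
    for k in (2, 3):
        out = nn.Conv1d(cond_dim, cond_dim, k)(F.pad(f, [1, 0]))
        print(k, out.shape[-1])  # 2 -> 10, 3 -> 9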
@@ -188,46 +203,50 @@
 
         # pre-net
         ref_phase = torch.tanh(self.create_phase_signals(periods))
+        if debug: print_channels(ref_phase, prefix="lavoce_01", name="pulse")
         x = self.af_prescale(ref_phase, cf)
         noise = self.noise_shaper(cf)
+        if debug: print_channels(torch.cat((x, noise), dim=1), prefix="lavoce_02", name="inputs")
         y = self.af_mix(torch.cat((x, noise), dim=1), cf)
-
-        if debug:
-            ch0 = y[0,0,:].detach().cpu().numpy()
-            ch1 = y[0,1,:].detach().cpu().numpy()
-            ch0 = (2**15 * ch0 / np.max(ch0)).astype(np.int16)
-            ch1 = (2**15 * ch1 / np.max(ch1)).astype(np.int16)
-            write_data('prior_channel0', ch0, 16000)
-            write_data('prior_channel1', ch1, 16000)
+        if debug: print_channels(y, prefix="lavoce_03", name="postselect1")
 
         # temporal shaping + innovating
         y1 = y[:, 0:1, :]
         y2 = self.tdshape1(y[:, 1:2, :], cf)
+        if debug: print_channels(y2, prefix="lavoce_04", name="postshape1")
         y = torch.cat((y1, y2), dim=1)
         y = self.af2(y, cf, debug=debug)
+        if debug: print_channels(y, prefix="lavoce_05", name="postselect2")
         cf = self.feature_transform(cf, self.post_af2)
 
         y1 = y[:, 0:1, :]
         y2 = self.tdshape2(y[:, 1:2, :], cf)
+        if debug: print_channels(y2, prefix="lavoce_06", name="postshape2")
         y = torch.cat((y1, y2), dim=1)
         y = self.af3(y, cf, debug=debug)
+        if debug: print_channels(y, prefix="lavoce_07", name="postmix1")
         cf = self.feature_transform(cf, self.post_af3)
 
         # spectral shaping
         y = self.cf1(y, cf, periods, debug=debug)
+        if debug: print_channels(y, prefix="lavoce_08", name="postcomb1")
         cf = self.feature_transform(cf, self.post_cf1)
 
         y = self.cf2(y, cf, periods, debug=debug)
+        if debug: print_channels(y, prefix="lavoce_09", name="postcomb2")
         cf = self.feature_transform(cf, self.post_cf2)
 
         y = self.af1(y, cf, debug=debug)
+        if debug: print_channels(y, prefix="lavoce_10", name="postselect3")
         cf = self.feature_transform(cf, self.post_af1)
 
         # final temporal env adjustment
         y1 = y[:, 0:1, :]
         y2 = self.tdshape3(y[:, 1:2, :], cf)
+        if debug: print_channels(y2, prefix="lavoce_11", name="postshape3")
         y = torch.cat((y1, y2), dim=1)
         y = self.af4(y, cf, debug=debug)
+        if debug: print_channels(y, prefix="lavoce_12", name="postmix2")
 
         return y
 
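
With the taps in place, every stage of the vocoder chain can be dumped in a single pass. A sketch of driving them; the feature and period shapes are assumptions inferred from num_features=20 and the pitch embedding (integer lags in [0, pitch_max]):

    # hypothetical debug run; writes lavoce_01_* ... lavoce_12_* via utils.endoscopy
    # (assumes the working directory is dnn/torch/osce)
    import torch
    from models import model_dict

    model = model_dict['lavoce']()
    features = torch.randn(1, 100, 20)          # (batch, feature frames, num_features)
    periods = torch.randint(32, 300, (1, 100))  # integer pitch lags per frame
    y = model(features, periods, debug=True)    # (batch, 1, samples)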
diff --git a/dnn/torch/osce/models/lavoce_400.py b/dnn/torch/osce/models/lavoce_400.py
new file mode 100644
index 00000000..ab7724df
--- /dev/null
+++ b/dnn/torch/osce/models/lavoce_400.py
@@ -0,0 +1,254 @@
+"""
+/* Copyright (c) 2023 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+import numpy as np
+
+from utils.layers.limited_adaptive_comb1d import LimitedAdaptiveComb1d
+from utils.layers.limited_adaptive_conv1d import LimitedAdaptiveConv1d
+from utils.layers.td_shaper import TDShaper
+from utils.layers.noise_shaper import NoiseShaper
+from utils.complexity import _conv1d_flop_count
+from utils.endoscopy import write_data
+
+from models.nns_base import NNSBase
+from models.lpcnet_feature_net import LPCNetFeatureNet
+from .scale_embedding import ScaleEmbedding
+
+class LaVoce400(nn.Module):
+ """ Linear-Adaptive VOCodEr """
+ FEATURE_FRAME_SIZE=160
+ FRAME_SIZE=40
+
+ def __init__(self,
+ num_features=20,
+ pitch_embedding_dim=64,
+ cond_dim=256,
+ pitch_max=300,
+ kernel_size=15,
+ preemph=0.85,
+ comb_gain_limit_db=-6,
+ global_gain_limits_db=[-6, 6],
+ conv_gain_limits_db=[-6, 6],
+ norm_p=2,
+ avg_pool_k=4,
+ pulses=False):
+
+ super().__init__()
+
+
+ self.num_features = num_features
+ self.cond_dim = cond_dim
+ self.pitch_max = pitch_max
+ self.pitch_embedding_dim = pitch_embedding_dim
+ self.kernel_size = kernel_size
+ self.preemph = preemph
+ self.pulses = pulses
+
+ assert self.FEATURE_FRAME_SIZE % self.FRAME_SIZE == 0
+ self.upsamp_factor = self.FEATURE_FRAME_SIZE // self.FRAME_SIZE
+
+ # pitch embedding
+ self.pitch_embedding = nn.Embedding(pitch_max + 1, pitch_embedding_dim)
+
+ # feature net
+ self.feature_net = LPCNetFeatureNet(num_features + pitch_embedding_dim, cond_dim, self.upsamp_factor)
+
+ # noise shaper
+ self.noise_shaper = NoiseShaper(cond_dim, self.FRAME_SIZE)
+
+ # comb filters
+ left_pad = self.kernel_size // 2
+ right_pad = self.kernel_size - 1 - left_pad
+ self.cf1 = LimitedAdaptiveComb1d(self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, use_bias=False, padding=[left_pad, right_pad], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, global_gain_limits_db=global_gain_limits_db, norm_p=norm_p)
+ self.cf2 = LimitedAdaptiveComb1d(self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, overlap_size=20, use_bias=False, padding=[left_pad, right_pad], max_lag=pitch_max + 1, gain_limit_db=comb_gain_limit_db, global_gain_limits_db=global_gain_limits_db, norm_p=norm_p)
+
+
+ self.af_prescale = LimitedAdaptiveConv1d(2, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+ self.af_mix = LimitedAdaptiveConv1d(2, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+ # spectral shaping
+ self.af1 = LimitedAdaptiveConv1d(1, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+ # non-linear transforms
+ self.tdshape1 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k, innovate=True)
+ self.tdshape2 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
+ self.tdshape3 = TDShaper(cond_dim, frame_size=self.FRAME_SIZE, avg_pool_k=avg_pool_k)
+
+ # combinators
+ self.af2 = LimitedAdaptiveConv1d(2, 2, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+ self.af3 = LimitedAdaptiveConv1d(2, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+ self.af4 = LimitedAdaptiveConv1d(2, 1, self.kernel_size, cond_dim, frame_size=self.FRAME_SIZE, use_bias=False, padding=[self.kernel_size - 1, 0], gain_limits_db=conv_gain_limits_db, norm_p=norm_p)
+
+ # feature transforms
+ self.post_cf1 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_cf2 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_af1 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_af2 = nn.Conv1d(cond_dim, cond_dim, 2)
+ self.post_af3 = nn.Conv1d(cond_dim, cond_dim, 2)
+
+
+    def create_phase_signals(self, periods, pulses=False):
+
+        batch_size = periods.size(0)
+        progression = torch.arange(1, self.FRAME_SIZE + 1, dtype=periods.dtype, device=periods.device).view((1, -1))
+        progression = torch.repeat_interleave(progression, batch_size, 0)
+
+        phase0 = torch.zeros(batch_size, dtype=periods.dtype, device=periods.device).unsqueeze(-1)
+        chunks = []
+        for sframe in range(periods.size(1)):
+            f = (2.0 * torch.pi / periods[:, sframe]).unsqueeze(-1)
+
+            if pulses:
+                alpha = torch.cos(f)
+                chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+                pulse_a = torch.relu(chunk_sin - alpha) / (1 - alpha)
+                pulse_b = torch.relu(-chunk_sin - alpha) / (1 - alpha)
+
+                chunk = torch.cat((pulse_a, pulse_b), dim = 1)
+            else:
+                chunk_sin = torch.sin(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+                chunk_cos = torch.cos(f * progression + phase0).view(batch_size, 1, self.FRAME_SIZE)
+
+                chunk = torch.cat((chunk_sin, chunk_cos), dim = 1)
+
+            phase0 = phase0 + self.FRAME_SIZE * f
+
+            chunks.append(chunk)
+
+        phase_signals = torch.cat(chunks, dim=-1)
+
+        return phase_signals
+
+    def flop_count(self, rate=16000, verbose=False):
+
+        frame_rate = rate / self.FRAME_SIZE
+
+        # feature net
+        feature_net_flops = self.feature_net.flop_count(frame_rate)
+        comb_flops = self.cf1.flop_count(rate) + self.cf2.flop_count(rate)
+        af_flops = self.af1.flop_count(rate) + self.af2.flop_count(rate) + self.af3.flop_count(rate) + self.af4.flop_count(rate) + self.af_prescale.flop_count(rate) + self.af_mix.flop_count(rate)
+        feature_flops = (_conv1d_flop_count(self.post_cf1, frame_rate) + _conv1d_flop_count(self.post_cf2, frame_rate)
+                         + _conv1d_flop_count(self.post_af1, frame_rate) + _conv1d_flop_count(self.post_af2, frame_rate) + _conv1d_flop_count(self.post_af3, frame_rate))
+
+        if verbose:
+            print(f"feature net: {feature_net_flops / 1e6} MFLOPS")
+            print(f"comb filters: {comb_flops / 1e6} MFLOPS")
+            print(f"adaptive conv: {af_flops / 1e6} MFLOPS")
+            print(f"feature transforms: {feature_flops / 1e6} MFLOPS")
+
+        return feature_net_flops + comb_flops + af_flops + feature_flops
+
+    def feature_transform(self, f, layer):
+        f = f.permute(0, 2, 1)
+        f = F.pad(f, [1, 0])
+        f = torch.tanh(layer(f))
+        return f.permute(0, 2, 1)
+
+    def forward(self, features, periods, debug=False):
+
+        periods = periods.squeeze(-1)
+        pitch_embedding = self.pitch_embedding(periods)
+
+        full_features = torch.cat((features, pitch_embedding), dim=-1)
+        cf = self.feature_net(full_features)
+
+        # upsample periods
+        periods = torch.repeat_interleave(periods, self.upsamp_factor, 1)
+
+        # pre-net
+        ref_phase = torch.tanh(self.create_phase_signals(periods))
+        x = self.af_prescale(ref_phase, cf)
+        noise = self.noise_shaper(cf)
+        y = self.af_mix(torch.cat((x, noise), dim=1), cf)
+
+        if debug:
+            ch0 = y[0,0,:].detach().cpu().numpy()
+            ch1 = y[0,1,:].detach().cpu().numpy()
+            ch0 = (2**15 * ch0 / np.max(ch0)).astype(np.int16)
+            ch1 = (2**15 * ch1 / np.max(ch1)).astype(np.int16)
+            write_data('prior_channel0', ch0, 16000)
+            write_data('prior_channel1', ch1, 16000)
+
+        # temporal shaping + innovating
+        y1 = y[:, 0:1, :]
+        y2 = self.tdshape1(y[:, 1:2, :], cf)
+        y = torch.cat((y1, y2), dim=1)
+        y = self.af2(y, cf, debug=debug)
+        cf = self.feature_transform(cf, self.post_af2)
+
+        y1 = y[:, 0:1, :]
+        y2 = self.tdshape2(y[:, 1:2, :], cf)
+        y = torch.cat((y1, y2), dim=1)
+        y = self.af3(y, cf, debug=debug)
+        cf = self.feature_transform(cf, self.post_af3)
+
+        # spectral shaping
+        y = self.cf1(y, cf, periods, debug=debug)
+        cf = self.feature_transform(cf, self.post_cf1)
+
+        y = self.cf2(y, cf, periods, debug=debug)
+        cf = self.feature_transform(cf, self.post_cf2)
+
+        y = self.af1(y, cf, debug=debug)
+        cf = self.feature_transform(cf, self.post_af1)
+
+        # final temporal env adjustment
+        y1 = y[:, 0:1, :]
+        y2 = self.tdshape3(y[:, 1:2, :], cf)
+        y = torch.cat((y1, y2), dim=1)
+        y = self.af4(y, cf, debug=debug)
+
+        return y
+
+    def process(self, features, periods, debug=False):
+
+        self.eval()
+        device = next(iter(self.parameters())).device
+        with torch.no_grad():
+
+            # run model
+            f = features.unsqueeze(0).to(device)
+            p = periods.unsqueeze(0).to(device)
+
+            y = self.forward(f, p, debug=debug).squeeze()
+
+            # deemphasis
+            if self.preemph > 0:
+                for i in range(len(y) - 1):
+                    y[i + 1] += self.preemph * y[i]
+
+            # clip to valid range
+            out = torch.clip((2**15) * y, -2**15, 2**15 - 1).short()
+
+            return out
\ No newline at end of file
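
For completeness, a synthesis sketch built on the new process() helper; the checkpoint name, its layout, and the feature source are hypothetical stand-ins:

    # hypothetical inference script for LaVoce400 (16 kHz output)
    import torch
    from scipy.io import wavfile
    from models.lavoce_400 import LaVoce400

    model = LaVoce400()
    ckpt = torch.load('lavoce400.pth', map_location='cpu')  # hypothetical checkpoint
    model.load_state_dict(ckpt['state_dict'])

    features = torch.randn(500, 20)             # (feature frames, num_features) stand-in
    periods = torch.randint(32, 300, (500, 1))  # integer pitch lags
    pcm = model.process(features, periods)      # int16 samples after deemphasis
    wavfile.write('lavoce400_out.wav', 16000, pcm.numpy())

Note that process() applies the deemphasis filter sample by sample in a Python loop, which is slow for long signals but keeps the reference implementation simple.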