PyTorch Virtual Batch Normalization and Speech Generator Implementation
import torch
import torch.nn as nn
from torch.nn.modules import Module
from torch.nn.parameter import Parameter
class VirtualBatchNorm1d(Module):
    """Module for Virtual Batch Normalization.

    Implementation borrowed and modified from Rafael_Valle's code, with help
    from SimonW in this discussion thread:
    https://discuss.pytorch.org/t/parameter-grad-of-conv-weight-is-none-after-virtual-batch-normalization/9036
    """
def __init__(self, num_features, eps=1e-5):
super().__init__()
        # feature dimension and epsilon for numerical stability
self.num_features = num_features
self.eps = eps # epsilon
# define gamma and beta parameters
self.gamma = Parameter(torch.normal(mean=1.0, std=0.02, size=(1, num_features, 1)))
self.beta = Parameter(torch.zeros(1, num_features, 1))
def get_stats(self, x):
        """Calculates mean and mean of squares for the given batch x.

        Args:
            x: tensor containing a batch of activations

        Returns:
            mean: mean tensor over features
            mean_sq: mean of squares tensor over features
        """
mean = x.mean(2, keepdim=True).mean(0, keepdim=True)
mean_sq = (x ** 2).mean(2, keepdim=True).mean(0, keepdim=True)
return mean, mean_sq
def forward(self, x, ref_mean, ref_mean_sq):
        """Forward pass of virtual batch normalization.

        Virtual batch normalization requires two forward passes: one for the
        reference batch and one for the current training batch (see the usage
        sketch after this class).

        Args:
            x: input tensor
            ref_mean: reference mean tensor over features
            ref_mean_sq: reference mean of squares tensor over features

        Returns:
            out: normalized batch tensor
            mean: mean tensor over features (detached batch statistics in
                reference mode, blended statistics otherwise)
            mean_sq: mean of squares tensor over features
        """
mean, mean_sq = self.get_stats(x)
if ref_mean is None or ref_mean_sq is None:
# reference mode - works just like batch norm
mean = mean.clone().detach()
mean_sq = mean_sq.clone().detach()
out = self.normalize(x, mean, mean_sq)
else:
# calculate new mean and mean_sq
batch_size = x.size(0)
new_coeff = 1. / (batch_size + 1.)
old_coeff = 1. - new_coeff
mean = new_coeff * mean + old_coeff * ref_mean
mean_sq = new_coeff * mean_sq + old_coeff * ref_mean_sq
out = self.normalize(x, mean, mean_sq)
return out, mean, mean_sq
def normalize(self, x, mean, mean_sq):
        """Normalizes tensor x given the statistics.

        Args:
            x: input tensor
            mean: mean over features
            mean_sq: mean of squares over features

        Returns:
            x: normalized batch tensor
        """
assert mean_sq is not None
assert mean is not None
assert len(x.size()) == 3 # specific for 1d VBN
        if mean.size(1) != self.num_features:
            raise ValueError('Mean tensor size not equal to number of features: given {}, expected {}'
                             .format(mean.size(1), self.num_features))
        if mean_sq.size(1) != self.num_features:
            raise ValueError('Squared mean tensor size not equal to number of features: given {}, expected {}'
                             .format(mean_sq.size(1), self.num_features))
std = torch.sqrt(self.eps + mean_sq - mean ** 2)
x = x - mean
x = x / std
x = x * self.gamma
x = x + self.beta
return x
def __repr__(self):
        return ('{name}(num_features={num_features}, eps={eps})'
                .format(name=self.__class__.__name__, **self.__dict__))
class Generator(nn.Module):
    """G: speech generator (only the encoder half is defined in this snippet)."""
def __init__(self):
super().__init__()
# encoder gets a noisy signal as input [B x 1 x 16384]
self.enc1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=32, stride=2, padding=15) # [B x 16 x 8192]
self.enc1_nl = nn.PReLU()
self.enc2 = nn.Conv1d(16, 32, 32, 2, 15) # [B x 32 x 4096]
self.enc2_nl = nn.PReLU()
self.enc3 = nn.Conv1d(32, 32, 32, 2, 15) # [B x 32 x 2048]
self.enc3_nl = nn.PReLU()
self.enc4 = nn.Conv1d(32, 64, 32, 2, 15) # [B x 64 x 1024]
self.enc4_nl = nn.PReLU()
self.enc5 = nn.Conv1d(64, 64, 32, 2, 15) # [B x 64 x 512]
self.enc5_nl = nn.PReLU()
self.enc6 = nn.Conv1d(64, 128, 32, 2, 15) # [B x 128 x 256]
self.enc6_nl = nn.PReLU()
self.enc7 = nn.Conv1d(128, 128, 32, 2, 15) # [B x 128 x 128]
self.enc7_nl = nn.PReLU()
self.enc8 = nn.Conv1d(128, 256, 32, 2, 15) # [B x 256 x 64]
self.enc8_nl = nn.PReLU()
# Model details:
# This snippet implements Virtual Batch Normalization (VBN) in PyTorch, together with the encoder of a generator model for speech processing.
# Virtual Batch Normalization is a variant of batch normalization that normalizes the current batch using statistics gathered from a fixed reference batch. This smooths training and makes the generator's output for a given input less dependent on the other examples in its minibatch. In this implementation, VirtualBatchNorm1d inherits from PyTorch's Module class, holds two learnable parameters, gamma and beta, and provides methods for computing the batch mean and mean of squares (see the usage sketch after the VirtualBatchNorm1d class above).
# The generator is a convolutional neural network intended to produce an enhanced speech signal from a noisy input signal. The snippet defines its encoder: 8 strided Conv1d layers, each followed by a PReLU activation, which downsample a 1-dimensional input of length 16384 to a [B x 256 x 64] feature map. A complete generator would add a mirrored decoder of transposed convolutions to map this representation back to a 1-dimensional speech signal (see the sketches below).
# You can adapt and extend this model for your specific speech generation task by adjusting the convolutional layers, activation functions, and other hyperparameters. For instance, you can add more convolutional layers, swap PReLU for ReLU or tanh, or change the kernel size, stride, and padding to control the receptive field.
# Remember to experiment with different hyperparameter settings to find the best configuration for your application. This code provides a basic framework for a speech generator built around Virtual Batch Normalization; you can build on it and explore ways to improve its performance and functionality.
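
# A quick shape check for the encoder above. This is a hypothetical smoke
# test, not part of the original model; it chains the conv + PReLU pairs by
# hand because this snippet does not define Generator.forward().
def _encoder_smoke_test():
    g = Generator()
    x = torch.randn(2, 1, 16384)  # [B x 1 x 16384] noisy input
    pairs = [(g.enc1, g.enc1_nl), (g.enc2, g.enc2_nl), (g.enc3, g.enc3_nl),
             (g.enc4, g.enc4_nl), (g.enc5, g.enc5_nl), (g.enc6, g.enc6_nl),
             (g.enc7, g.enc7_nl), (g.enc8, g.enc8_nl)]
    for conv, act in pairs:
        x = act(conv(x))  # each stage halves the time length
    return x  # expected shape: [2, 256, 64]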
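
# The decoder half is not shown in this snippet. Below is one possible
# mirrored decoder step, a sketch assuming ConvTranspose1d with the same
# kernel size, stride, and padding as the encoder; the layer and function
# names are illustrative, not from the original code.
def _decoder_step_sketch():
    dec = nn.ConvTranspose1d(in_channels=256, out_channels=128,
                             kernel_size=32, stride=2, padding=15)
    act = nn.PReLU()
    x = torch.randn(2, 256, 64)  # shaped like the encoder output
    y = act(dec(x))
    return y  # shape: [2, 128, 128] -- time length doubled, channels halved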