Audio Enhancement Generator Model: Architecture and Implementation
This is the generator model of a generative adversarial network (GAN) for audio enhancement. It is a fully convolutional encoder-decoder: the encoder compresses a noisy 16384-sample waveform into a compact feature map, a latent vector is injected at the bottleneck, and the decoder reconstructs an enhanced waveform, with skip connections between homologous encoder and decoder stages. The full model:

```python
import torch
import torch.nn as nn


class Generator(nn.Module):
    """G"""

    def __init__(self):
        super().__init__()
        # encoder gets a noisy signal as input [B x 1 x 16384]
        self.enc1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=32,
                              stride=2, padding=15)   # [B x 16 x 8192]
        self.enc1_nl = nn.PReLU()
        self.enc2 = nn.Conv1d(16, 32, 32, 2, 15)      # [B x 32 x 4096]
        self.enc2_nl = nn.PReLU()
        self.enc3 = nn.Conv1d(32, 32, 32, 2, 15)      # [B x 32 x 2048]
        self.enc3_nl = nn.PReLU()
        self.enc4 = nn.Conv1d(32, 64, 32, 2, 15)      # [B x 64 x 1024]
        self.enc4_nl = nn.PReLU()
        self.enc5 = nn.Conv1d(64, 64, 32, 2, 15)      # [B x 64 x 512]
        self.enc5_nl = nn.PReLU()
        self.enc6 = nn.Conv1d(64, 128, 32, 2, 15)     # [B x 128 x 256]
        self.enc6_nl = nn.PReLU()
        self.enc7 = nn.Conv1d(128, 128, 32, 2, 15)    # [B x 128 x 128]
        self.enc7_nl = nn.PReLU()
        self.enc8 = nn.Conv1d(128, 256, 32, 2, 15)    # [B x 256 x 64]
        self.enc8_nl = nn.PReLU()
        self.enc9 = nn.Conv1d(256, 256, 32, 2, 15)    # [B x 256 x 32]
        self.enc9_nl = nn.PReLU()
        self.enc10 = nn.Conv1d(256, 512, 32, 2, 15)   # [B x 512 x 16]
        self.enc10_nl = nn.PReLU()
        self.enc11 = nn.Conv1d(512, 1024, 32, 2, 15)  # [B x 1024 x 8]
        self.enc11_nl = nn.PReLU()

        # decoder generates an enhanced signal
        # each decoder output is concatenated with the homologous encoder output,
        # so the input channel counts are doubled
        self.dec10 = nn.ConvTranspose1d(in_channels=2048, out_channels=512,
                                        kernel_size=32, stride=2, padding=15)
        self.dec10_nl = nn.PReLU()  # out: [B x 512 x 16] -> (concat) [B x 1024 x 16]
        self.dec9 = nn.ConvTranspose1d(1024, 256, 32, 2, 15)   # [B x 256 x 32]
        self.dec9_nl = nn.PReLU()
        self.dec8 = nn.ConvTranspose1d(512, 256, 32, 2, 15)    # [B x 256 x 64]
        self.dec8_nl = nn.PReLU()
        self.dec7 = nn.ConvTranspose1d(512, 128, 32, 2, 15)    # [B x 128 x 128]
        self.dec7_nl = nn.PReLU()
        self.dec6 = nn.ConvTranspose1d(256, 128, 32, 2, 15)    # [B x 128 x 256]
        self.dec6_nl = nn.PReLU()
        self.dec5 = nn.ConvTranspose1d(256, 64, 32, 2, 15)     # [B x 64 x 512]
        self.dec5_nl = nn.PReLU()
        self.dec4 = nn.ConvTranspose1d(128, 64, 32, 2, 15)     # [B x 64 x 1024]
        self.dec4_nl = nn.PReLU()
        self.dec3 = nn.ConvTranspose1d(128, 32, 32, 2, 15)     # [B x 32 x 2048]
        self.dec3_nl = nn.PReLU()
        self.dec2 = nn.ConvTranspose1d(64, 32, 32, 2, 15)      # [B x 32 x 4096]
        self.dec2_nl = nn.PReLU()
        self.dec1 = nn.ConvTranspose1d(64, 16, 32, 2, 15)      # [B x 16 x 8192]
        self.dec1_nl = nn.PReLU()
        self.dec_final = nn.ConvTranspose1d(32, 1, 32, 2, 15)  # [B x 1 x 16384]
        self.dec_tanh = nn.Tanh()

        # initialize weights
        self.init_weights()

    def init_weights(self):
        """Initialize weights for convolution layers using Xavier initialization."""
        for m in self.modules():
            if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
                nn.init.xavier_normal_(m.weight)

    def forward(self, x, z):
        """Forward pass of generator.

        Args:
            x: input batch (noisy signal), [B x 1 x 16384]
            z: latent vector, [B x 1024 x 8]
        """
        # encoding step
        e1 = self.enc1(x)
        e2 = self.enc2(self.enc1_nl(e1))
        e3 = self.enc3(self.enc2_nl(e2))
        e4 = self.enc4(self.enc3_nl(e3))
        e5 = self.enc5(self.enc4_nl(e4))
        e6 = self.enc6(self.enc5_nl(e5))
        e7 = self.enc7(self.enc6_nl(e6))
        e8 = self.enc8(self.enc7_nl(e7))
        e9 = self.enc9(self.enc8_nl(e8))
        e10 = self.enc10(self.enc9_nl(e9))
        e11 = self.enc11(self.enc10_nl(e10))
        # c = compressed feature, the "thought vector"
        c = self.enc11_nl(e11)

        # concatenate the thought vector with the latent variable
        encoded = torch.cat((c, z), dim=1)

        # decoding step
        # dX_c: decoder output concatenated with the skip-connected encoder
        # output, then passed through the nonlinearity
        d10 = self.dec10(encoded)
        d10_c = self.dec10_nl(torch.cat((d10, e10), dim=1))
        d9 = self.dec9(d10_c)
        d9_c = self.dec9_nl(torch.cat((d9, e9), dim=1))
        d8 = self.dec8(d9_c)
        d8_c = self.dec8_nl(torch.cat((d8, e8), dim=1))
        d7 = self.dec7(d8_c)
        d7_c = self.dec7_nl(torch.cat((d7, e7), dim=1))
        d6 = self.dec6(d7_c)
        d6_c = self.dec6_nl(torch.cat((d6, e6), dim=1))
        d5 = self.dec5(d6_c)
        d5_c = self.dec5_nl(torch.cat((d5, e5), dim=1))
        d4 = self.dec4(d5_c)
        d4_c = self.dec4_nl(torch.cat((d4, e4), dim=1))
        d3 = self.dec3(d4_c)
        d3_c = self.dec3_nl(torch.cat((d3, e3), dim=1))
        d2 = self.dec2(d3_c)
        d2_c = self.dec2_nl(torch.cat((d2, e2), dim=1))
        d1 = self.dec1(d2_c)
        d1_c = self.dec1_nl(torch.cat((d1, e1), dim=1))
        out = self.dec_tanh(self.dec_final(d1_c))
        return out
```
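As a quick sanity check, the model can be exercised end to end on random tensors; this is a minimal sketch, assuming the `Generator` class above is in scope. Note that the latent vector `z` must have shape `[B, 1024, 8]` so that its concatenation with the bottleneck feature yields the 2048 input channels `dec10` expects:

```python
import torch

G = Generator()
x = torch.randn(4, 1, 16384)  # a batch of 4 noisy waveforms
z = torch.randn(4, 1024, 8)   # latent code matching the bottleneck shape
out = G(x, z)
print(out.shape)              # torch.Size([4, 1, 16384])
```

A section-by-section analysis of the code follows.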
The class definition is straightforward: `Generator` subclasses `nn.Module`, all layers are declared in `__init__`, and the constructor ends by calling `self.init_weights()`.

The encoder is a stack of eleven `Conv1d` layers that extract features from the noisy input signal and compress it into a compact representation. Each convolution is followed by a `PReLU`, a parametric rectified linear unit that applies a learned nonlinearity to the convolution's output. Every layer uses `kernel_size=32`, `stride=2`, `padding=15`, so each stage halves the temporal length while the channel count grows from 1 to 1024:

| Layer | Channels (in → out) | Output shape |
|-------|---------------------|--------------|
| enc1  | 1 → 16     | [B, 16, 8192]  |
| enc2  | 16 → 32    | [B, 32, 4096]  |
| enc3  | 32 → 32    | [B, 32, 2048]  |
| enc4  | 32 → 64    | [B, 64, 1024]  |
| enc5  | 64 → 64    | [B, 64, 512]   |
| enc6  | 64 → 128   | [B, 128, 256]  |
| enc7  | 128 → 128  | [B, 128, 128]  |
| enc8  | 128 → 256  | [B, 256, 64]   |
| enc9  | 256 → 256  | [B, 256, 32]   |
| enc10 | 256 → 512  | [B, 512, 16]   |
| enc11 | 512 → 1024 | [B, 1024, 8]   |

The halving of the temporal dimension at each stage follows directly from the `Conv1d` output-length formula, as checked below.
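For PyTorch's `Conv1d` with dilation 1, the output length is L_out = ⌊(L_in + 2·padding − kernel_size) / stride⌋ + 1; with kernel 32, stride 2, padding 15, the first stage gives (16384 + 30 − 32) / 2 + 1 = 8192. A minimal sketch (my own check, not part of the original post) that replays the arithmetic for all eleven stages:

```python
def conv1d_out_len(l_in: int, kernel: int = 32, stride: int = 2, padding: int = 15) -> int:
    # PyTorch Conv1d length formula with dilation = 1
    return (l_in + 2 * padding - kernel) // stride + 1

length = 16384
for i in range(1, 12):
    length = conv1d_out_len(length)
    print(f"enc{i}: length {length}")
# prints 8192, 4096, 2048, ..., 16, 8 -- matching the shape comments in the code
```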
The decoder mirrors the encoder with eleven `ConvTranspose1d` layers (again `kernel_size=32`, `stride=2`, `padding=15`), each doubling the temporal length as it reconstructs the enhanced signal from the compressed features. Because every decoder output is concatenated with the homologous encoder output before the next stage, each transposed convolution's input channel count is twice the previous stage's output channel count:

| Layer     | Channels (in → out) | Output shape |
|-----------|---------------------|--------------|
| dec10     | 2048 → 512 | [B, 512, 16]   |
| dec9      | 1024 → 256 | [B, 256, 32]   |
| dec8      | 512 → 256  | [B, 256, 64]   |
| dec7      | 512 → 128  | [B, 128, 128]  |
| dec6      | 256 → 128  | [B, 128, 256]  |
| dec5      | 256 → 64   | [B, 64, 512]   |
| dec4      | 128 → 64   | [B, 64, 1024]  |
| dec3      | 128 → 32   | [B, 32, 2048]  |
| dec2      | 64 → 32    | [B, 32, 4096]  |
| dec1      | 64 → 16    | [B, 16, 8192]  |
| dec_final | 32 → 1     | [B, 1, 16384]  |

Every decoder stage is followed by a `PReLU` except the last: `dec_final` maps back to a single channel, and its output passes through `Tanh`, a hyperbolic tangent that bounds the generated waveform to the range −1 to 1.

`init_weights` iterates over `self.modules()` and applies Xavier (Glorot) normal initialization, `nn.init.xavier_normal_`, to the weight of every `Conv1d` and `ConvTranspose1d`.

`forward` ties it all together. The encoding step pushes `x` through the eleven conv/PReLU stages, keeping every intermediate activation `e1` … `e11` for the skip connections. The bottleneck activation `c` (the compressed feature, or "thought vector", of shape [B, 1024, 8]) is concatenated with the latent vector `z` along the channel dimension, giving the [B, 2048, 8] tensor that `dec10` expects. Each decoding step then upsamples, concatenates the result with its encoder counterpart (`d10` with `e10`, `d9` with `e9`, and so on), and applies the nonlinearity; for example, `dec10`'s 512-channel output becomes the 1024 input channels of `dec9` after concatenation with `e10`. The final `ConvTranspose1d` plus `Tanh` produces the enhanced signal `out` with the same shape as the input, [B, 1, 16384].
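The doubling of the temporal length at each decoder stage can be verified the same way: for PyTorch's `ConvTranspose1d` with dilation 1 and no output padding, the output length is L_out = (L_in − 1)·stride − 2·padding + kernel_size, e.g. (8 − 1)·2 − 30 + 32 = 16 for `dec10`. A minimal sketch (again my own check, not part of the original post):

```python
def conv_transpose1d_out_len(l_in: int, kernel: int = 32, stride: int = 2, padding: int = 15) -> int:
    # PyTorch ConvTranspose1d length formula with dilation = 1, output_padding = 0
    return (l_in - 1) * stride - 2 * padding + kernel

length = 8  # bottleneck length after enc11
for name in ["dec10", "dec9", "dec8", "dec7", "dec6", "dec5",
             "dec4", "dec3", "dec2", "dec1", "dec_final"]:
    length = conv_transpose1d_out_len(length)
    print(f"{name}: length {length}")
# prints 16, 32, 64, ..., 8192, 16384 -- matching the shape comments in the code
```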