PyTorch 实现 CNN 音频降噪模型:完整代码和评估
本教程提供 PyTorch 实现 CNN 音频降噪模型的完整代码,涵盖数据加载、数据预处理、模型构建、音频降噪、模型评估等内容。
数据加载
import os
import librosa
import numpy as np
import torch
from torch.utils.data import Dataset
class AudioDataset(Dataset):
    """Dataset yielding raw waveforms loaded from a directory of audio files.

    Each item is a tuple ``(y, sr, file_name)`` where ``y`` is the 1-D float
    waveform, ``sr`` the file's native sample rate, and ``file_name`` the
    file's basename.

    NOTE(review): items may have different lengths, so the default DataLoader
    collate will fail for batch_size > 1 unless all files share one length —
    confirm against the training data.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir
        # Sort for a deterministic index -> file mapping (os.listdir order is
        # OS-dependent) and keep regular files only, skipping subdirectories.
        self.file_list = sorted(
            f for f in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, f))
        )

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_name = self.file_list[idx]
        file_path = os.path.join(self.data_dir, file_name)
        # Load at the file's native sample rate (sr=None), downmixed to mono.
        y, sr = librosa.load(file_path, sr=None, mono=True)
        return y, sr, file_name
模型构建(编码器、解码器与降噪网络)
import torch.nn as nn
import torch.nn.functional as F
class Encoder(nn.Module):
    """Five-stage strided Conv1d encoder.

    Maps a ``(batch, 1, length)`` waveform to a ``(batch, 512, length/32)``
    latent: each stage is Conv1d(kernel 3, stride 2, padding 1) followed by
    BatchNorm1d and ReLU, doubling the channel count while halving time.
    """

    def __init__(self):
        super().__init__()
        widths = [1, 32, 64, 128, 256, 512]
        # Register conv1..conv5 / bn1..bn5 so checkpoints keep the same keys.
        for stage, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"conv{stage}",
                    nn.Conv1d(c_in, c_out, kernel_size=3, stride=2, padding=1))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(c_out))

    def forward(self, x):
        # Conv -> BatchNorm -> ReLU at every stage, in registration order.
        for stage in range(1, 6):
            conv = getattr(self, f"conv{stage}")
            bn = getattr(self, f"bn{stage}")
            x = F.relu(bn(conv(x)))
        return x
class Decoder(nn.Module):
    """Mirror of :class:`Encoder`: five ConvTranspose1d upsampling stages.

    Maps a ``(batch, 512, n)`` latent back to a ``(batch, 1, 32*n)``
    waveform; hidden stages use BatchNorm + ReLU, the last stage a sigmoid.
    """

    def __init__(self):
        super().__init__()
        # Every stage doubles the time dimension (stride 2, output_padding 1).
        up = dict(kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv6 = nn.ConvTranspose1d(512, 256, **up)
        self.bn6 = nn.BatchNorm1d(256)
        self.conv7 = nn.ConvTranspose1d(256, 128, **up)
        self.bn7 = nn.BatchNorm1d(128)
        self.conv8 = nn.ConvTranspose1d(128, 64, **up)
        self.bn8 = nn.BatchNorm1d(64)
        self.conv9 = nn.ConvTranspose1d(64, 32, **up)
        self.bn9 = nn.BatchNorm1d(32)
        self.conv10 = nn.ConvTranspose1d(32, 1, **up)

    def forward(self, x):
        hidden = ((self.conv6, self.bn6), (self.conv7, self.bn7),
                  (self.conv8, self.bn8), (self.conv9, self.bn9))
        for conv, bn in hidden:
            x = F.relu(bn(conv(x)))
        # Sigmoid bounds the reconstructed waveform to [0, 1].
        return torch.sigmoid(self.conv10(x))
class DenoiseNet(nn.Module):
    """Convolutional encoder-decoder autoencoder for waveform denoising."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, x):
        # Compress to a latent code, then reconstruct the waveform from it.
        return self.decoder(self.encoder(x))
模型训练与测试
import os
import librosa
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import DataLoader
def train(model, train_dataset, val_dataset, batch_size, num_epochs, lr, save_path):
    """Train ``model`` as a denoising autoencoder with an MSE objective.

    Args:
        model: an ``nn.Module`` mapping ``(batch, 1, length)`` -> same shape.
        train_dataset / val_dataset: datasets yielding ``(y, sr, file_name)``.
        batch_size: DataLoader batch size.
        num_epochs: number of epochs to run.
        lr: Adam learning rate.
        save_path: directory for periodic checkpoints (every 10 epochs),
            or ``None`` to skip saving.

    Returns:
        ``(train_loss_list, val_loss_list)`` — per-epoch average losses.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device:', device)
    # BUG FIX: the model must live on the same device as the data; the
    # original never moved it, which fails whenever CUDA is available.
    model = model.to(device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_loss_list = []
    val_loss_list = []
    for epoch in range(num_epochs):
        train_loss = 0.0
        val_loss = 0.0
        model.train()
        for y, sr, file_name in tqdm(train_loader):
            # The DataLoader already collates y to a tensor; calling
            # torch.tensor() on a tensor copies it and emits a warning.
            y = y.float().unsqueeze(1).to(device)
            x_hat = model(y)
            # NOTE(review): the target here is the input itself (plain
            # autoencoder). For supervised denoising the clean signal should
            # be the target — confirm against the data pipeline.
            loss = criterion(x_hat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate sum of per-sample losses for a dataset-wide mean.
            train_loss += loss.item() * y.size(0)
        model.eval()
        with torch.no_grad():
            for y, sr, file_name in tqdm(val_loader):
                y = y.float().unsqueeze(1).to(device)
                x_hat = model(y)
                loss = criterion(x_hat, y)
                val_loss += loss.item() * y.size(0)
        train_loss /= len(train_dataset)
        val_loss /= len(val_dataset)
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)
        print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}'.format(
            epoch + 1, num_epochs, train_loss, val_loss))
        # Checkpoint every 10 epochs when a save directory was given.
        if save_path is not None and (epoch + 1) % 10 == 0:
            torch.save(model.state_dict(),
                       os.path.join(save_path, 'epoch_{}.pt'.format(epoch + 1)))
    return train_loss_list, val_loss_list
def test(model, test_dataset, batch_size, save_dir):
    """Run ``model`` over ``test_dataset`` and save the denoised audio.

    Writes one WAV file per input into ``save_dir`` (created if missing),
    keeping each input's original file name and sample rate.
    """
    # librosa.output.write_wav was removed in librosa 0.8; scipy's WAV
    # writer accepts float32 waveforms directly.
    from scipy.io import wavfile

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device:', device)
    # BUG FIX: keep model and data on the same device.
    model = model.to(device)
    os.makedirs(save_dir, exist_ok=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    with torch.no_grad():
        for y, sr, file_name in tqdm(test_loader):
            # The DataLoader already collates y to a tensor.
            y = y.float().unsqueeze(1).to(device)
            x_hat = model(y)
            # Squeeze only the channel axis: a bare .squeeze() would also
            # drop the batch axis when batch_size == 1 and break the loop.
            x_hat = x_hat.cpu().squeeze(1).numpy()
            for i in range(x_hat.shape[0]):
                file_path = os.path.join(save_dir, file_name[i])
                # sr is collated to a tensor; the writer needs a plain int.
                wavfile.write(file_path, int(sr[i]), x_hat[i].astype(np.float32))
存储降噪后的音频
import os
import librosa
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
def test(model, test_dataset, batch_size, save_dir):
    """Run ``model`` over ``test_dataset`` and save the denoised audio.

    Writes one WAV file per input into ``save_dir`` (created if missing),
    keeping each input's original file name and sample rate.

    NOTE(review): this duplicates the ``test`` defined in the previous
    section; keep only one definition.
    """
    # librosa.output.write_wav was removed in librosa 0.8; scipy's WAV
    # writer accepts float32 waveforms directly.
    from scipy.io import wavfile

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device:', device)
    # BUG FIX: keep model and data on the same device.
    model = model.to(device)
    os.makedirs(save_dir, exist_ok=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    with torch.no_grad():
        for y, sr, file_name in tqdm(test_loader):
            # The DataLoader already collates y to a tensor.
            y = y.float().unsqueeze(1).to(device)
            x_hat = model(y)
            # Squeeze only the channel axis: a bare .squeeze() would also
            # drop the batch axis when batch_size == 1 and break the loop.
            x_hat = x_hat.cpu().squeeze(1).numpy()
            for i in range(x_hat.shape[0]):
                file_path = os.path.join(save_dir, file_name[i])
                # sr is collated to a tensor; the writer needs a plain int.
                wavfile.write(file_path, int(sr[i]), x_hat[i].astype(np.float32))
评估指标
import os
import librosa
import numpy as np
from pesq import pesq
from tqdm import tqdm
def evaluate(model, test_dataset, batch_size, clean_dir, noisy_dir):
    """Evaluate denoising quality with PESQ and STOI.

    For each test file the degraded signal is always scored against the
    clean reference:
      * "Clean"   — clean vs. clean (upper bound / sanity check)
      * "Noisy"   — clean vs. noisy input (pre-denoising baseline)
      * "Denoise" — clean vs. model output

    Prints the mean of each metric over the whole test set.

    NOTE(review): wide-band PESQ ('wb') is only defined for 16 kHz audio —
    confirm the dataset's sample rate before trusting these numbers.
    """
    # BUG FIX: stoi was referenced but never imported anywhere in the file.
    from pystoi import stoi

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Device:', device)
    # Keep model and data on the same device.
    model = model.to(device)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    # PESQ accumulators
    clean_pesq_list = []
    noisy_pesq_list = []
    denoise_pesq_list = []
    # STOI accumulators
    clean_stoi_list = []
    noisy_stoi_list = []
    denoise_stoi_list = []
    model.eval()
    with torch.no_grad():
        for y, sr, file_name in tqdm(test_loader):
            # The DataLoader already collates y to a tensor.
            y = y.float().unsqueeze(1).to(device)
            x_hat = model(y)
            # Squeeze only the channel axis so batch_size == 1 keeps its
            # leading batch axis.
            x_hat = x_hat.cpu().squeeze(1).numpy()
            for i in range(x_hat.shape[0]):
                # sr is collated to a tensor; the metrics need a plain int.
                rate = int(sr[i])
                # Load the paired clean and noisy recordings for this file.
                clean_path = os.path.join(clean_dir, file_name[i])
                noisy_path = os.path.join(noisy_dir, file_name[i])
                clean_audio, _ = librosa.load(clean_path, sr=None, mono=True)
                noisy_audio, _ = librosa.load(noisy_path, sr=None, mono=True)
                # BUG FIX: the reference must always be the clean signal.
                # The original scored clean-vs-noisy as "clean" and
                # noisy-vs-noisy (trivially the maximum) as "noisy".
                clean_pesq_list.append(pesq(rate, clean_audio, clean_audio, 'wb'))
                noisy_pesq_list.append(pesq(rate, clean_audio, noisy_audio, 'wb'))
                denoise_pesq_list.append(pesq(rate, clean_audio, x_hat[i], 'wb'))
                # Same clean-reference convention for STOI.
                clean_stoi_list.append(stoi(clean_audio, clean_audio, rate, extended=False))
                noisy_stoi_list.append(stoi(clean_audio, noisy_audio, rate, extended=False))
                denoise_stoi_list.append(stoi(clean_audio, x_hat[i], rate, extended=False))
    # Average each metric over the test set.
    clean_pesq_mean = np.mean(clean_pesq_list)
    noisy_pesq_mean = np.mean(noisy_pesq_list)
    denoise_pesq_mean = np.mean(denoise_pesq_list)
    clean_stoi_mean = np.mean(clean_stoi_list)
    noisy_stoi_mean = np.mean(noisy_stoi_list)
    denoise_stoi_mean = np.mean(denoise_stoi_list)
    print('PESQ:')
    print('Clean: {:.4f}'.format(clean_pesq_mean))
    print('Noisy: {:.4f}'.format(noisy_pesq_mean))
    print('Denoise: {:.4f}'.format(denoise_pesq_mean))
    print('STOI:')
    print('Clean: {:.4f}'.format(clean_stoi_mean))
    print('Noisy: {:.4f}'.format(noisy_stoi_mean))
    print('Denoise: {:.4f}'.format(denoise_stoi_mean))
原文地址: https://www.cveoy.top/t/topic/nsMR 著作权归作者所有。请勿转载和采集!