PyTorch 实现 CNN 音频降噪模型:完整代码和评估指标
由于该任务涉及到音频数据处理和评估指标,建议使用 Jupyter Notebook 或 Google Colab 运行代码。
首先,需要安装以下库:
- Librosa:用于音频数据的加载和处理
- PyTorch:用于构建和训练深度学习模型
- soundfile:用于音频文件读写
- numpy:用于数值计算和数组操作
- tqdm:用于显示训练进度条
!pip install librosa torch soundfile numpy tqdm
接下来,加载数据。在这里,我们使用了一些预处理来减少音频数据的大小和采样率,以便更快地训练模型。我们还随机划分了数据集,其中 80% 用于训练,20% 用于测试。
import os
import librosa
import soundfile as sf
import numpy as np
from tqdm import tqdm
# Prepare the cache directory and randomly partition the clean/noisy
# file list into an 80% training and a 20% test split.
data_dir = 'data/'
os.makedirs(data_dir, exist_ok=True)

# Shuffle once, then slice at the 80% mark.
files = os.listdir('clean/')
np.random.shuffle(files)
split_point = int(0.8 * len(files))
train_files = files[:split_point]
test_files = files[split_point:]
# Build the training set: slide a 20 ms window (320 samples @ 16 kHz,
# 50% overlap) over each aligned clean/noisy pair.
x_train = []
y_train = []
frame_length = 320  # 20 ms at 16 kHz
hop_length = 160    # 50% overlap between consecutive frames
for f in tqdm(train_files):
    # Load the clean reference and the matching noisy recording,
    # resampled to mono 16 kHz.
    clean, sr = librosa.load('clean/' + f, sr=16000, mono=True)
    noisy, sr = librosa.load('noisy/' + f, sr=16000, mono=True)
    # Trim leading/trailing silence. BUG FIX: the original trimmed only
    # the noisy signal, which shifted it relative to `clean` and broke
    # the pairwise alignment; compute the interval once and apply it to
    # BOTH signals so they stay sample-aligned.
    _, (start, end) = librosa.effects.trim(noisy, top_db=20)
    noisy = noisy[start:end]
    clean = clean[start:end]
    # Align lengths (the two files may still differ by a few samples).
    length = min(len(clean), len(noisy))
    clean = clean[:length]
    noisy = noisy[:length]
    # Frame the pair into fixed-size (noisy input, clean target) examples.
    for i in range(0, length - frame_length, hop_length):
        x_train.append(noisy[i:i + frame_length])
        y_train.append(clean[i:i + frame_length])
# Build the test set with the same framing as the training set.
x_test = []
y_test = []
frame_length = 320  # 20 ms at 16 kHz
hop_length = 160    # 50% overlap between consecutive frames
for f in tqdm(test_files):
    # Load the clean reference and the matching noisy recording,
    # resampled to mono 16 kHz.
    clean, sr = librosa.load('clean/' + f, sr=16000, mono=True)
    noisy, sr = librosa.load('noisy/' + f, sr=16000, mono=True)
    # Trim leading/trailing silence. BUG FIX: the original trimmed only
    # the noisy signal, which shifted it relative to `clean` and broke
    # the pairwise alignment; compute the interval once and apply it to
    # BOTH signals so they stay sample-aligned.
    _, (start, end) = librosa.effects.trim(noisy, top_db=20)
    noisy = noisy[start:end]
    clean = clean[start:end]
    # Align lengths (the two files may still differ by a few samples).
    length = min(len(clean), len(noisy))
    clean = clean[:length]
    noisy = noisy[:length]
    # Frame the pair into fixed-size (noisy input, clean target) examples.
    for i in range(0, length - frame_length, hop_length):
        x_test.append(noisy[i:i + frame_length])
        y_test.append(clean[i:i + frame_length])
# Stack the frame lists into 2-D arrays and persist them so training
# can be re-run without redoing the audio preprocessing.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

for name, arr in (('x_train', x_train), ('y_train', y_train),
                  ('x_test', x_test), ('y_test', y_test)):
    np.save(data_dir + name + '.npy', arr)
接下来,定义我们的 CNN 降噪模型。在这里,我们使用了一个简单的 3 层 CNN,每层都有 32 个卷积核,加上 ReLU 激活函数和 BatchNormalization 层。
import torch
import torch.nn as nn
class CNN(nn.Module):
    """A small 1-D convolutional audio denoiser.

    Four Conv1d layers with kernel size 5 and 'same' padding; the first
    three are each followed by BatchNorm1d + ReLU, and the last projects
    the 32 feature channels back to a single waveform channel. Input and
    output both have shape (batch, 1, samples).
    """

    def __init__(self):
        super(CNN, self).__init__()
        # Attribute names are kept identical to the original so that
        # previously saved state_dicts remain loadable.
        self.conv1 = nn.Conv1d(1, 32, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(32, 32, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(32)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv1d(32, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.relu3 = nn.ReLU(inplace=True)
        self.conv4 = nn.Conv1d(32, 1, kernel_size=5, stride=1, padding=2)

    def forward(self, x):
        """Map a noisy waveform batch to its denoised estimate."""
        stages = ((self.conv1, self.bn1, self.relu1),
                  (self.conv2, self.bn2, self.relu2),
                  (self.conv3, self.bn3, self.relu3))
        for conv, bn, relu in stages:
            x = relu(bn(conv(x)))
        return self.conv4(x)
然后,我们定义训练函数。在这里,我们使用了 MSE 损失函数和 Adam 优化器。我们还在每个 epoch 结束后用测试集评估模型,计算信噪比(SNR)以及由 PESQ 与 STOI 组成的语音质量评分(作为 MOS 的近似,并非真正的主观 MOS 评分)。
import torch.optim as optim
from sklearn.metrics import mean_squared_error
from pesq import pesq
from pystoi.stoi import stoi
def train(model, epochs, batch_size, lr):
    """Train the denoiser with MSE loss and Adam; evaluate every epoch.

    After each epoch the test split is scored (MSE loss, SNR, and a
    PESQ+STOI quality score) and the weights are checkpointed to
    'model.pt'. Relies on the module-level `data_dir` and the .npy
    files written by the preprocessing step.

    Args:
        model: nn.Module mapping (batch, 1, samples) -> same shape.
        epochs: number of full passes over the training data.
        batch_size: mini-batch size for training and evaluation.
        lr: Adam learning rate.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Load the pre-framed data produced by the preprocessing step.
    x_train = np.load(data_dir + 'x_train.npy')
    y_train = np.load(data_dir + 'y_train.npy')
    x_test = np.load(data_dir + 'x_test.npy')
    y_test = np.load(data_dir + 'y_test.npy')
    eps = 1e-8  # BUG FIX: keeps the SNR log10 finite on a perfectly reconstructed frame
    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for i in range(0, len(x_train), batch_size):
            x = torch.from_numpy(x_train[i:i+batch_size, np.newaxis, :]).float().to(device)
            y = torch.from_numpy(y_train[i:i+batch_size, np.newaxis, :]).float().to(device)
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is exact even
            # when the final batch is smaller.
            running_loss += loss.item() * len(x)
        train_loss = running_loss / len(x_train)
        # ---- evaluation pass ----
        with torch.no_grad():
            model.eval()
            running_loss = 0.0
            snr_sum = 0.0
            mos_sum = 0.0
            mos_count = 0
            for i in range(0, len(x_test), batch_size):
                x = torch.from_numpy(x_test[i:i+batch_size, np.newaxis, :]).float().to(device)
                y = torch.from_numpy(y_test[i:i+batch_size, np.newaxis, :]).float().to(device)
                outputs = model(x)
                loss = criterion(outputs, y)
                running_loss += loss.item() * len(x)
                for j in range(len(x)):
                    clean = y[j].cpu().numpy().flatten()
                    denoised = outputs[j].cpu().numpy().flatten()
                    snr_sum += 20 * np.log10((np.linalg.norm(clean) + eps)
                                             / (np.linalg.norm(clean - denoised) + eps))
                    # NOTE(review): PESQ requires utterance-length input
                    # (>= 0.25 s of audio); a 320-sample frame makes it
                    # raise, so metric failures are skipped instead of
                    # aborting the whole evaluation.
                    try:
                        mos_sum += (pesq(16000, clean, denoised, 'wb')
                                    + stoi(clean, denoised, 16000))
                        mos_count += 1
                    except Exception:
                        pass
            test_loss = running_loss / len(x_test)
            snr = snr_sum / len(x_test)
            # BUG FIX: average over the frames that were actually scored,
            # not over the full test set.
            mos = mos_sum / max(mos_count, 1)
        print('Epoch [{}/{}], Train Loss: {:.4f}, Test Loss: {:.4f}, SNR: {:.2f}, MOS: {:.2f}'
              .format(epoch+1, epochs, train_loss, test_loss, snr, mos))
        # Checkpoint after every epoch so progress survives interruption.
        torch.save(model.state_dict(), 'model.pt')
接下来,我们就可以开始训练模型了。
# Instantiate the denoiser and run the full training loop
# (50 epochs, batches of 64, Adam lr 1e-3).
model = CNN()
train(model, epochs=50, batch_size=64, lr=0.001)
训练完成后,我们可以使用训练好的模型来降噪测试集中的音频,并计算信噪比和 MOS 评分。
# Evaluate the trained model on the test split, frame by frame.
# BUG FIX: `device` was never defined in this script section (NameError);
# it is created here, and the model is explicitly moved onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN()
model.load_state_dict(torch.load('model.pt'))
model.to(device)
model.eval()
x_test = np.load(data_dir + 'x_test.npy')
y_test = np.load(data_dir + 'y_test.npy')
snr_sum = 0.0
mos_sum = 0.0
mos_count = 0
eps = 1e-8  # keeps the SNR log10 finite on a perfectly reconstructed frame
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for i in range(len(x_test)):
        # One frame per forward pass, shaped (1, 1, frame_length).
        x = torch.from_numpy(x_test[i:i+1, np.newaxis, :]).float().to(device)
        y = torch.from_numpy(y_test[i:i+1, np.newaxis, :]).float().to(device)
        outputs = model(x)
        clean = y.cpu().numpy().flatten()
        denoised = outputs.cpu().numpy().flatten()
        snr_sum += 20 * np.log10((np.linalg.norm(clean) + eps)
                                 / (np.linalg.norm(clean - denoised) + eps))
        # NOTE(review): PESQ requires >= 0.25 s of audio; a 320-sample
        # frame makes it raise, so metric failures are skipped rather
        # than aborting the loop.
        try:
            mos_sum += pesq(16000, clean, denoised, 'wb') + stoi(clean, denoised, 16000)
            mos_count += 1
        except Exception:
            pass
snr = snr_sum / len(x_test)
mos = mos_sum / max(mos_count, 1)  # average over frames actually scored
print('SNR: {:.2f}, MOS: {:.2f}'.format(snr, mos))
最后,我们可以将降噪后的音频存储到磁盘并可视化。
# Write every denoised test frame to disk as a 16 kHz mono wav file.
os.makedirs('output/', exist_ok=True)
# BUG FIX: `device` was never defined at top level; define it here so
# this section also runs standalone.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for i in range(len(x_test)):
        # One frame per forward pass, shaped (1, 1, frame_length).
        x = torch.from_numpy(x_test[i:i+1, np.newaxis, :]).float().to(device)
        outputs = model(x)
        denoised = outputs.cpu().numpy().flatten()
        sf.write('output/{}.wav'.format(i), denoised, 16000)
# Visualize one sample: clean reference, noisy input, denoised output.
import matplotlib.pyplot as plt

clean = y_test[0]
noisy = x_test[0]
# BUG FIX: a .wav file is not a NumPy archive, so np.load() cannot read
# it; read the waveform back with soundfile instead.
denoised, _ = sf.read('output/0.wav')
plt.figure(figsize=(10, 5))
plt.subplot(1, 3, 1)
plt.title('Clean')
plt.plot(clean)
plt.subplot(1, 3, 2)
plt.title('Noisy')
plt.plot(noisy)
plt.subplot(1, 3, 3)
plt.title('Denoised')
plt.plot(denoised)
plt.show()
以上就是 CNN 音频降噪模型的完整实现代码。
原文地址: https://www.cveoy.top/t/topic/nsNk 著作权归作者所有。请勿转载和采集!