由于该任务涉及到音频数据处理和评估指标,建议使用 Jupyter Notebook 或 Google Colab 运行代码。

首先,需要安装以下库:

  • Librosa:用于音频数据的加载和处理
  • PyTorch:用于构建和训练深度学习模型
  • soundfile:用于音频文件读写
  • numpy:用于数值计算和数组操作
  • tqdm:用于显示训练进度条
!pip install librosa torch soundfile numpy tqdm

接下来,加载数据。在这里,我们使用了一些预处理来减少音频数据的大小和采样率,以便更快地训练模型。我们还随机划分了数据集,其中 80% 用于训练,20% 用于测试。

import os
import librosa
import soundfile as sf
import numpy as np
from tqdm import tqdm

data_dir = 'data/'
os.makedirs(data_dir, exist_ok=True)

# Randomly split files into 80% train / 20% test.
# FIX: sort the directory listing and shuffle with a seeded local
# Generator so the split is reproducible. The original used an
# unsorted os.listdir() plus an unseeded global shuffle, so every
# run produced a different split and silently leaked "test" files
# into training on re-runs.
files = sorted(os.listdir('clean/'))
rng = np.random.default_rng(42)
rng.shuffle(files)
split = int(0.8 * len(files))
train_files = files[:split]
test_files = files[split:]

# Build the training set: overlapping (noisy, clean) frame pairs.
x_train = []
y_train = []
frame_length = 320  # 20 ms at 16 kHz
hop_length = 160    # 50% overlap between consecutive frames
for f in tqdm(train_files):
    # Load the clean reference audio (resampled to 16 kHz mono)
    clean, sr = librosa.load('clean/'+f, sr=16000, mono=True)
    # Load the corresponding noisy audio
    noisy, sr = librosa.load('noisy/'+f, sr=16000, mono=True)
    # Trim leading/trailing silence.
    # BUG FIX: the original trimmed only `noisy`, shifting it relative
    # to `clean` and destroying the sample alignment between model
    # input and target. Apply the same trim interval to both signals.
    _, (start, end) = librosa.effects.trim(noisy, top_db=20)
    noisy = noisy[start:end]
    clean = clean[start:end]
    # Align lengths (source files can differ by a few samples)
    length = min(len(clean), len(noisy))
    clean = clean[:length]
    noisy = noisy[:length]
    # Slice both signals into overlapping frames
    for i in range(0, length-frame_length, hop_length):
        x_train.append(noisy[i:i+frame_length])
        y_train.append(clean[i:i+frame_length])

# Build the test set with the same framing as the training set.
x_test = []
y_test = []
frame_length = 320  # 20 ms at 16 kHz
hop_length = 160    # 50% overlap between consecutive frames
for f in tqdm(test_files):
    # Load the clean reference audio (resampled to 16 kHz mono)
    clean, sr = librosa.load('clean/'+f, sr=16000, mono=True)
    # Load the corresponding noisy audio
    noisy, sr = librosa.load('noisy/'+f, sr=16000, mono=True)
    # Trim leading/trailing silence.
    # BUG FIX: the original trimmed only `noisy`, shifting it relative
    # to `clean` and destroying the sample alignment between model
    # input and target. Apply the same trim interval to both signals.
    _, (start, end) = librosa.effects.trim(noisy, top_db=20)
    noisy = noisy[start:end]
    clean = clean[start:end]
    # Align lengths (source files can differ by a few samples)
    length = min(len(clean), len(noisy))
    clean = clean[:length]
    noisy = noisy[:length]
    # Slice both signals into overlapping frames
    for i in range(0, length-frame_length, hop_length):
        x_test.append(noisy[i:i+frame_length])
        y_test.append(clean[i:i+frame_length])

# Stack the frame lists into numpy arrays, then persist each array
# to disk so the training step can reload them without re-running
# the audio preprocessing.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

for name, arr in (('x_train', x_train), ('y_train', y_train),
                  ('x_test', x_test), ('y_test', y_test)):
    np.save(data_dir + name + '.npy', arr)

接下来,定义我们的 CNN 降噪模型。在这里,我们使用了一个简单的 CNN:三个各含 32 个卷积核的卷积层(每层后接 BatchNormalization 和 ReLU 激活),最后再通过一个卷积层映射回单通道输出。

import torch
import torch.nn as nn

class CNN(nn.Module):
    """Simple 1-D convolutional denoiser.

    Three Conv-BN-ReLU stages with 32 filters each, followed by a
    final convolution projecting back to a single channel. Every
    convolution uses kernel 5 / stride 1 / padding 2, so the output
    waveform has exactly the same length as the input.
    """

    def __init__(self):
        super(CNN, self).__init__()
        # Attribute names are part of the state_dict contract
        # (model.pt is loaded by name later), so they must not change.
        self.conv1 = nn.Conv1d(1, 32, kernel_size=5, stride=1, padding=2)
        self.bn1 = nn.BatchNorm1d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(32, 32, kernel_size=5, stride=1, padding=2)
        self.bn2 = nn.BatchNorm1d(32)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv1d(32, 32, kernel_size=5, stride=1, padding=2)
        self.bn3 = nn.BatchNorm1d(32)
        self.relu3 = nn.ReLU(inplace=True)
        self.conv4 = nn.Conv1d(32, 1, kernel_size=5, stride=1, padding=2)

    def forward(self, x):
        # Run the three hidden Conv-BN-ReLU stages in order, then the
        # single-channel output projection.
        stages = (
            (self.conv1, self.bn1, self.relu1),
            (self.conv2, self.bn2, self.relu2),
            (self.conv3, self.bn3, self.relu3),
        )
        for conv, bn, relu in stages:
            x = relu(bn(conv(x)))
        return self.conv4(x)

然后,我们定义训练函数。在这里,我们使用了 MSE 损失函数和 Adam 优化器。我们还在每个 epoch 结束后用测试集评估模型,并计算信噪比以及由 PESQ 与 STOI 之和构成的质量评分(代码中记为 MOS)。

import torch.optim as optim
from sklearn.metrics import mean_squared_error
from pesq import pesq
from pystoi.stoi import stoi

def train(model, epochs, batch_size, lr):
    """Train `model` to map noisy 320-sample frames to clean frames.

    Uses MSE loss with the Adam optimizer; after each epoch, evaluates
    on the test arrays and prints loss, an SNR estimate, and a quality
    score labelled "MOS" (actually PESQ + STOI summed per frame).

    Args:
        model: the CNN denoiser (nn.Module); trained in place.
        epochs: number of passes over the training data.
        batch_size: mini-batch size for both training and evaluation.
        lr: Adam learning rate.

    Side effects: reads the .npy arrays under the module-level
    `data_dir`, prints per-epoch metrics, and writes the final
    weights to 'model.pt'.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Load the pre-framed datasets produced by the preprocessing step.
    x_train = np.load(data_dir+'x_train.npy')
    y_train = np.load(data_dir+'y_train.npy')
    x_test = np.load(data_dir+'x_test.npy')
    y_test = np.load(data_dir+'y_test.npy')
    # Training loop
    # NOTE(review): the training data is never reshuffled between
    # epochs, so every epoch sees identical mini-batches — confirm
    # this is intended.
    for epoch in range(epochs):
        # Training pass
        model.train()
        running_loss = 0.0
        for i in range(0, len(x_train), batch_size):
            # Convert the slice to a (batch, 1, frame) float tensor
            x = torch.from_numpy(x_train[i:i+batch_size, np.newaxis, :]).float().to(device)
            y = torch.from_numpy(y_train[i:i+batch_size, np.newaxis, :]).float().to(device)
            # Forward pass
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            # Backward pass and parameter update
            loss.backward()
            optimizer.step()
            # Accumulate loss weighted by batch size (last batch may be smaller)
            running_loss += loss.item() * len(x)
        train_loss = running_loss / len(x_train)
        # Evaluation pass on the test set
        with torch.no_grad():
            model.eval()
            running_loss = 0.0
            snr_sum = 0.0
            mos_sum = 0.0
            for i in range(0, len(x_test), batch_size):
                # Convert the slice to a (batch, 1, frame) float tensor
                x = torch.from_numpy(x_test[i:i+batch_size, np.newaxis, :]).float().to(device)
                y = torch.from_numpy(y_test[i:i+batch_size, np.newaxis, :]).float().to(device)
                # Forward pass only (no_grad is active)
                outputs = model(x)
                loss = criterion(outputs, y)
                # Accumulate loss weighted by batch size
                running_loss += loss.item() * len(x)
                # Per-frame SNR and quality metrics
                for j in range(len(x)):
                    clean = y[j].cpu().numpy().flatten()
                    denoised = outputs[j].cpu().numpy().flatten()
                    # NOTE(review): if a frame is reconstructed exactly,
                    # ||clean - denoised|| is 0 and this divides by zero.
                    snr_sum += 20 * np.log10(np.linalg.norm(clean) / np.linalg.norm(clean - denoised))
                    # NOTE(review): pesq in 'wb' mode generally rejects
                    # buffers this short (320 samples = 20 ms) — confirm
                    # it does not raise here. Also, "MOS" below is the
                    # sum of PESQ and STOI, not a calibrated MOS.
                    mos_sum += pesq(16000, clean, denoised, 'wb') + stoi(clean, denoised, 16000)
            test_loss = running_loss / len(x_test)
            snr = snr_sum / len(x_test)
            mos = mos_sum / len(x_test)
            print('Epoch [{}/{}], Train Loss: {:.4f}, Test Loss: {:.4f}, SNR: {:.2f}, MOS: {:.2f}' 
                  .format(epoch+1, epochs, train_loss, test_loss, snr, mos))
    # Save the trained weights
    torch.save(model.state_dict(), 'model.pt')

最后,我们可以开始训练模型。

# Build the denoiser and run the full training loop with the chosen
# hyper-parameters.
hyperparams = {'epochs': 50, 'batch_size': 64, 'lr': 0.001}
model = CNN()
train(model, **hyperparams)

训练完成后,我们可以使用训练好的模型来降噪测试集中的音频,并计算信噪比和 MOS 评分。

# Reload the trained weights and score the model on the test frames.
# FIX: `device` was only ever defined inside train(); referencing it at
# module scope raised a NameError. Define it here and move the model to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN()
model.load_state_dict(torch.load('model.pt'))
model.to(device)
model.eval()

x_test = np.load(data_dir+'x_test.npy')
y_test = np.load(data_dir+'y_test.npy')

snr_sum = 0.0
mos_sum = 0.0
# FIX: inference must run under no_grad — without it `outputs` carries a
# grad history and outputs.cpu().numpy() raises ("Can't call numpy() on
# Tensor that requires grad").
with torch.no_grad():
    for i in range(len(x_test)):
        # One frame at a time, shaped (1, 1, frame)
        x = torch.from_numpy(x_test[i:i+1, np.newaxis, :]).float().to(device)
        y = torch.from_numpy(y_test[i:i+1, np.newaxis, :]).float().to(device)
        # Forward pass
        outputs = model(x)
        # Per-frame SNR and quality metrics
        clean = y.cpu().numpy().flatten()
        denoised = outputs.cpu().numpy().flatten()
        snr_sum += 20 * np.log10(np.linalg.norm(clean) / np.linalg.norm(clean - denoised))
        # NOTE(review): pesq 'wb' mode may reject 20 ms buffers, and this
        # "MOS" is PESQ + STOI summed, not a calibrated MOS — confirm.
        mos_sum += pesq(16000, clean, denoised, 'wb') + stoi(clean, denoised, 16000)

snr = snr_sum / len(x_test)
mos = mos_sum / len(x_test)
print('SNR: {:.2f}, MOS: {:.2f}'.format(snr, mos))

最后,我们可以将降噪后的音频存储到磁盘并可视化。

os.makedirs('output/', exist_ok=True)

# Denoise each test frame and write it out as a 16 kHz wav file.
# FIX: `device` is not defined at module scope in the original script;
# define it here so this block is self-contained.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# FIX: run under no_grad — without it outputs.cpu().numpy() raises
# because the tensor still requires grad.
with torch.no_grad():
    for i in range(len(x_test)):
        # One frame at a time, shaped (1, 1, frame)
        x = torch.from_numpy(x_test[i:i+1, np.newaxis, :]).float().to(device)
        # Forward pass
        outputs = model(x)
        # Back to a flat numpy waveform
        denoised = outputs.cpu().numpy().flatten()
        # Save as a 16 kHz wav file
        sf.write('output/{}.wav'.format(i), denoised, 16000)

# 可视化一个样本
# Visualize one sample: clean target, noisy input, denoised output.
import matplotlib.pyplot as plt

clean = y_test[0]
noisy = x_test[0]
# FIX: np.load cannot parse a .wav file (it expects .npy/.npz and raises);
# read the exported audio back with soundfile instead.
denoised, _ = sf.read('output/0.wav')

plt.figure(figsize=(10, 5))
plt.subplot(1, 3, 1)
plt.title('Clean')
plt.plot(clean)
plt.subplot(1, 3, 2)
plt.title('Noisy')
plt.plot(noisy)
plt.subplot(1, 3, 3)
plt.title('Denoised')
plt.plot(denoised)
plt.show()

以上就是 CNN 音频降噪模型的完整实现代码。

PyTorch 实现 CNN 音频降噪模型:完整代码和评估指标

原文地址: https://www.cveoy.top/t/topic/nsNk 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录