音频特征图生成与尺寸统一调整

import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image


# Extension of the files that should be removed
file_suffix = '.png'

# Root directory that is searched recursively for files to remove
# (the directory itself is kept; only matching files inside it are deleted)
folder_path = 'D:/论文代码/casia汉语情感语料库/'

# Walk the whole tree below folder_path and delete every file whose name
# ends with file_suffix, printing the path of each deleted file.
for current_dir, _subdirs, names in os.walk(folder_path):
    for name in names:
        if not name.endswith(file_suffix):
            continue
        file_path = os.path.join(current_dir, name)
        os.remove(file_path)
        print("已经成功删除文件：", file_path)

# Root of the audio dataset and root directory for the spectrogram images
dataset_path = "D:/论文代码/casia汉语情感语料库/"
spectrogram_path = "D:/论文代码/语谱图/"

# For every immediate sub-directory of the dataset, render one STFT
# spectrogram image per .wav file found directly inside that sub-directory.
for subfolder in os.listdir(dataset_path):
    subfolder_path = os.path.join(dataset_path, subfolder)
    # Entries at the dataset root that are not directories are ignored
    if not os.path.isdir(subfolder_path):
        continue
    for audio_file in os.listdir(subfolder_path):
        # Only .wav audio is handled
        if not audio_file.endswith(".wav"):
            continue
        audio_path = os.path.join(subfolder_path, audio_file)
        # Load the audio, take the STFT magnitude and convert it to dB
        # relative to the loudest bin
        y, sr = librosa.load(audio_path, sr=None)
        magnitude = np.abs(librosa.stft(y))
        D = librosa.amplitude_to_db(magnitude, ref=np.max)
        # Output directory mirrors the name of the dataset sub-directory
        spectrogram_folder = os.path.join(spectrogram_path, subfolder)
        os.makedirs(spectrogram_folder, exist_ok=True)
        # Image name is the audio file name with its extension replaced by .png
        image_name = os.path.splitext(audio_file)[0] + ".png"
        spectrogram_path_full = os.path.join(spectrogram_folder, image_name)
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(D, y_axis='linear')
        plt.colorbar(format='%+2.0f dB')
        plt.savefig(spectrogram_path_full, bbox_inches='tight', pad_inches=0)
        # Release the figure so figures do not accumulate across files
        plt.clf()
        plt.close('all')

# Root of the audio dataset
dataset_dir = "D:/论文代码/casia汉语情感语料库/"

# Walk every directory below dataset_dir and render a log-mel spectrogram
# image for each .wav file, mirroring the dataset layout under the output
# root. The unused directory list is bound to `_dirs` (not `dir`) so the
# built-in dir() is not rebound at module level.
for subdir, _dirs, files in os.walk(dataset_dir):
    for file in files:
        filepath = os.path.join(subdir, file)
        # Only .wav audio is handled
        if not filepath.endswith(".wav"):
            continue
        # Load the audio and compute the mel power spectrogram in dB
        y, sr = librosa.load(filepath, sr=None)
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        log_S = librosa.power_to_db(S, ref=np.max)
        # Derive the output directory from the path relative to the dataset
        # root instead of a textual str.replace, which depends on the exact
        # spelling and separators of the prefix. The image keeps the full
        # audio file name, so "x.wav" becomes "x.wav.png".
        specpath = os.path.normpath(os.path.join(
            "D:/论文代码/spectrograms/",
            os.path.relpath(subdir, dataset_dir),
            file + ".png",
        ))
        os.makedirs(os.path.dirname(specpath), exist_ok=True)
        # Plot and save; the figure is always closed, even if plotting or
        # saving raises, so figures do not pile up across files.
        fig = plt.figure(figsize=(12, 4))
        try:
            librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
            plt.title('Mel power spectrogram')
            plt.colorbar(format='%+02.0f dB')
            plt.tight_layout()
            plt.savefig(specpath)
        finally:
            plt.close(fig)

import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt


# Render the MFCCs of an audio file as an image and save it as a PNG
def audio_to_mfcc(audio_path, save_path, data_root=None, size=(224, 224)):
    """Render the MFCCs of one audio file to a resized PNG image.

    The image is written below ``save_path`` in a sub-directory that mirrors
    the location of ``audio_path`` relative to the dataset root, using the
    audio file's base name with a ``.png`` extension.

    Args:
        audio_path: Path of the audio file to convert.
        save_path: Root directory under which the image is written.
        data_root: Dataset root used to compute the mirrored sub-directory.
            When ``None`` the module-level ``data_path`` is used, which is
            what this function always relied on before the parameter existed.
        size: ``(width, height)`` of the saved image in pixels.
    """
    base_dir = data_path if data_root is None else data_root

    # Load the audio and compute its MFCCs
    y, sr = librosa.load(audio_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    fig = plt.figure(figsize=(10, 4))
    try:
        librosa.display.specshow(mfcc, x_axis='time')
        plt.colorbar()
        plt.title('MFCC')
        # The canvas only holds rendered pixels after an explicit draw();
        # reading the buffer without drawing first fails or yields a blank
        # image. buffer_rgba() is provided by Agg-based canvases.
        fig.canvas.draw()
        # np.array copies the pixels so the image does not depend on the
        # figure's memory once the figure is closed.
        pixels = np.array(fig.canvas.buffer_rgba(), dtype=np.uint8)
    finally:
        # Always release the figure, even if plotting failed
        plt.close(fig)

    image = Image.fromarray(pixels).resize(size)

    # Mirror the audio file's directory (relative to the dataset root)
    # below save_path
    save_name = os.path.splitext(os.path.basename(audio_path))[0] + '.png'
    relative_dir = os.path.dirname(os.path.relpath(audio_path, start=base_dir))
    out_dir = os.path.join(save_path, relative_dir)
    os.makedirs(out_dir, exist_ok=True)
    image.save(os.path.join(out_dir, save_name))


# Dataset root and output root for the MFCC images
data_path = 'D:/论文代码/casia汉语情感语料库/'
save_path = 'D:/论文代码/MFCC/'

# Convert every .wav file found anywhere below data_path into an MFCC image,
# printing the path of each converted file.
for root, dirs, files in os.walk(data_path):
    for file in files:
        if not file.endswith('.wav'):
            continue
        audio_path = os.path.join(root, file)
        audio_to_mfcc(audio_path, save_path)
        print('Convert audio to MFCC:', audio_path)

import os
import numpy as np
from PIL import Image

# Collect every sub-directory below a root directory
def get_subdirs(root_dir):
    """Return the paths of all directories nested below ``root_dir``.

    The tree is traversed with ``os.walk``; every directory name reported at
    any depth is joined to its parent path. ``root_dir`` itself is not
    included in the result.
    """
    return [
        os.path.join(parent, child)
        for parent, children, _files in os.walk(root_dir)
        for child in children
    ]

# Load an image file as a grayscale numpy array
def load_image(image_path):
    """Open ``image_path``, convert it to grayscale and return it as an array.

    The image is converted to PIL mode ``'L'`` before being turned into a
    numpy array.
    """
    grayscale = Image.open(image_path).convert('L')
    return np.array(grayscale)

# Write a numpy array to disk as an image
def save_image(image_data, save_path):
    """Build a PIL image from ``image_data`` and save it to ``save_path``."""
    Image.fromarray(image_data).save(save_path)

# Multiply corresponding feature maps across several feature directories
def get_feature_product(feature_dirs, save_root='D:/论文代码/特征图乘积', size=(224, 224)):
    """Save the element-wise product of corresponding feature images.

    Every directory in ``feature_dirs`` is scanned recursively for ``.png``
    files. Images are matched across the directories by their directory
    relative to the feature root plus their base name; a trailing ``.wav``
    in the base name is ignored because the mel-spectrogram step of this
    file names its images ``<audio file name>.png`` (for example
    ``x.wav.png``). Only images present in every feature directory are
    combined; the others are skipped.

    Each matched image is loaded as grayscale, resized to ``size``, scaled
    to the range [0, 1] and multiplied in floating point. The previous
    in-place ``uint8`` multiplication wrapped around modulo 256. The product
    is scaled back to 0-255 and written as
    ``<save_root>/<relative dir>/<base name>.png``.

    NOTE(review): matching by relative directory and base name is inferred
    from how the generator steps in this file name their outputs. Confirm
    this is the intended correspondence.

    Args:
        feature_dirs: Root directories of the individual feature images.
        save_root: Root directory for the product images.
        size: ``(width, height)`` every image is resized to before
            multiplying.
    """
    # One index per feature directory: (relative dir, base name) -> image path
    indexes = []
    for feature_dir in feature_dirs:
        index = {}
        for dirpath, _dirnames, filenames in os.walk(feature_dir):
            rel_dir = os.path.normpath(os.path.relpath(dirpath, feature_dir))
            for filename in filenames:
                stem, ext = os.path.splitext(filename)
                # Ignore anything that is not a PNG image
                if ext.lower() != '.png':
                    continue
                # "x.wav.png" and "x.png" describe the same utterance
                if stem.lower().endswith('.wav'):
                    stem = stem[:-len('.wav')]
                index[(rel_dir, stem)] = os.path.join(dirpath, filename)
        indexes.append(index)

    # Keep only the images that exist in every feature directory
    if indexes:
        common = set(indexes[0]).intersection(*indexes[1:])
    else:
        common = set()

    width, height = size
    for rel_dir, stem in sorted(common):
        # Accumulate in float so the product can neither overflow nor wrap
        product = np.ones((height, width), dtype=np.float64)
        for index in indexes:
            gray = load_image(index[(rel_dir, stem)])
            resized = Image.fromarray(gray).resize(size)
            product *= np.asarray(resized, dtype=np.float64) / 255.0
        save_dir = os.path.normpath(os.path.join(save_root, rel_dir))
        os.makedirs(save_dir, exist_ok=True)
        # Scale the [0, 1] product back to 8-bit grayscale for saving
        result = np.rint(product * 255.0).astype(np.uint8)
        save_image(result, os.path.join(save_dir, stem + '.png'))
    print(f'特征图乘积已保存至{save_root}')

# Multiply the corresponding feature maps of the three feature directories
if __name__ == '__main__':
    feature_dirs = [
        'D:/论文代码/spectrograms',
        'D:/论文代码/语谱图',
        'D:/论文代码/MFCC',
    ]
    get_feature_product(feature_dirs)

代码中关键修改如下：

导入PIL库中的Image模块:

from PIL import Image

在生成特征图的函数中添加调整大小的代码:

def audio_to_mfcc(audio_path, save_path):
    # ...
    # 调整图像大小为(224, 224)
    image = Image.fromarray(np.uint8(plt.gcf().canvas.renderer.buffer_rgba()))
    image = image.resize((224, 224))
    # ...

在特征图相乘时，也添加调整大小的代码:

def get_feature_product(feature_dirs):
    # ...
    if i == 0:
        # ...
        # 调整图像大小为(224, 224)
        feature_product = Image.fromarray(feature_product)
        feature_product = feature_product.resize((224, 224))
        feature_product = np.array(feature_product)
    else:
        # ...
        # 调整图像大小为(224, 224)
        feature = Image.fromarray(feature)
        feature = feature.resize((224, 224))
        feature = np.array(feature)
        # ...

通过以上步骤，就能将所有特征图调整为统一的大小，方便后续处理。

注意:

- 调整图像大小的目标尺寸可以根据需求自行设定，例如 (224, 224) 或其他尺寸。
- 如果需要进行其他图像处理，例如裁剪、旋转等，可以在调整大小之前进行处理。
- 在进行特征图相乘之前，需要将图像转换为numpy数组，方便进行乘法运算；由于图像数组默认为uint8类型，直接相乘会发生溢出回绕，应先转换为浮点类型（例如归一化到[0, 1]）再相乘。
- 调整图像大小会影响特征图的细节，需要根据实际情况选择合适的尺寸。