用 PyTorch 搭建循环神经网络（RNN）实现唐诗生成任务

首先，我们需要准备好数据集。这里我们选择使用《全唐诗》作为我们的数据集。具体操作如下：

1. 下载《全唐诗》文本文件，可以在以下链接中找到：https://github.com/Werneror/PoetryGeneration/tree/master/data
2. 将下载的文本文件放在项目文件夹下的 data 目录中。
3. 使用以下代码将文本文件转换为可以被模型读取的形式，并保存为 data.txt 文件。

import os

# Flatten every raw poem file under ./data/ into a single corpus file,
# data.txt, keeping only the text that follows a fixed-width line prefix.
path = './data/'
# Drop the macOS Finder metadata file only if it is actually there;
# list.remove() would raise ValueError on machines without '.DS_Store'.
file_list = [name for name in os.listdir(path) if name != '.DS_Store']

with open('data.txt', 'w', encoding='utf-8') as f:
    for file_name in file_list:
        with open(path + file_name, 'r', encoding='utf-8') as poem_file:
            for line in poem_file:
                # line[5] is inspected below, so a line needs at least six
                # characters; shorter lines are skipped instead of crashing.
                if len(line) < 6:
                    continue
                # A full-width comma at index 5: keep the text from index 7 on.
                if line[5] == '，' and len(line) > 7:
                    f.write(line[7:])
                # A full-width period at index 5: keep the text from index 6 on.
                elif line[5] == '。' and len(line) > 6:
                    f.write(line[6:])

接下来，我们开始搭建循环神经网络模型。我们使用LSTM作为我们的循环神经网络模型，并使用Embedding层将每个汉字转换为向量。具体代码如下：

import random
import torch
import torch.nn as nn
import torch.optim as optim

# Seed the Python and PyTorch RNGs so sampling and weight init are repeatable
random.seed(1)
torch.manual_seed(1)

# Load the preprocessed corpus as one string
with open('data.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Assign an integer id to every distinct character.
# The vocabulary is sorted because set iteration order for strings changes
# between interpreter runs (hash randomisation); sorting keeps the ids stable
# so that the fixed seeds above really give reproducible runs.
word_to_int = {word: i for i, word in enumerate(sorted(set(data)))}
# Build the reverse table by inverting the forward one, so the two mappings
# cannot disagree.
int_to_word = {i: word for word, i in word_to_int.items()}
# Replace the corpus string with its list of character ids
data = [word_to_int[word] for word in data]

# 定义模型
class LSTM(nn.Module):
    """Character-level model: embedding -> single-layer LSTM -> linear layer.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: width of both the embedding vectors and the LSTM state.
        output_size: number of scores produced per input position.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Embedding width equals the LSTM input width, so the embedded
        # vectors are fed to the LSTM directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, inputs, hidden):
        """Run a sequence of token ids through the network.

        Args:
            inputs: tensor of token ids; its length is taken as the
                sequence length.
            hidden: the (h, c) state tuple passed to the LSTM.

        Returns:
            A pair (scores, new_hidden), where scores has one row per
            input position.
        """
        seq_len = len(inputs)
        # Reshape to (seq_len, 1, features): a batch containing one sequence
        step_vectors = self.embedding(inputs).view(seq_len, 1, -1)
        lstm_out, next_hidden = self.lstm(step_vectors, hidden)
        # Collapse the batch dimension before the output projection
        scores = self.fc(lstm_out.view(seq_len, -1))
        return scores, next_hidden

    def init_hidden(self):
        """Return an all-zero (h, c) state of shape (1, 1, hidden_size) each."""
        state_shape = (1, 1, self.hidden_size)
        return (torch.zeros(*state_shape), torch.zeros(*state_shape))

# Hyperparameters: the model reads and predicts over the same vocabulary,
# so the input and output sizes are both the number of distinct characters.
input_size = output_size = len(word_to_int)
hidden_size = 128
learning_rate = 0.01
num_epochs = 2000

# Build the network, then the loss function and the optimizer that updates it
model = LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model: each epoch takes one optimisation step on a randomly
# chosen window of the corpus, predicting every next character in the window.
seq_len = 30  # number of characters in each training window
if len(data) <= seq_len:
    raise ValueError(
        'corpus must contain more than {} characters, got {}'.format(seq_len, len(data))
    )
for epoch in range(num_epochs):
    # Start every window from a fresh zero state
    hidden = model.init_hidden()
    # Pick a random window. random.randint is inclusive on both ends, and the
    # targets are the inputs shifted by one, so the last valid start index is
    # len(data) - seq_len - 1; anything later would leave the targets one
    # element short of the model outputs.
    start_index = random.randint(0, len(data) - seq_len - 1)
    end_index = start_index + seq_len
    inputs = torch.tensor(data[start_index:end_index], dtype=torch.long)
    targets = torch.tensor(data[start_index + 1:end_index + 1], dtype=torch.long)
    # Forward pass
    outputs, hidden = model(inputs, hidden)
    loss = criterion(outputs, targets)
    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Report the loss every 100 epochs
    if (epoch + 1) % 100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, loss.item()))

# Generate a poem with the trained model: 7 lines of 4 characters each,
# decoded greedily from a randomly chosen first character.
with torch.no_grad():
    hidden = model.init_hidden()
    # Pick a random character from the vocabulary as the seed
    start_char = random.choice(list(word_to_int.keys()))
    inputs = torch.tensor([word_to_int[start_char]], dtype=torch.long)
    poem_lines = []
    # The seed character already occupies the first slot of the first line,
    # so that line only needs three generated characters; without this the
    # first line would come out five characters long.
    current_line = [start_char]
    for _ in range(7):
        while len(current_line) < 4:
            output, hidden = model(inputs, hidden)
            # Greedy decoding: feed the highest-scoring character back in
            _, predicted = torch.max(output, dim=1)
            current_line.append(int_to_word[predicted.item()])
            inputs = predicted
        poem_lines.append(''.join(current_line))
        current_line = []
    # Every line, including the last, ends with a newline
    poem = '\n'.join(poem_lines) + '\n'
    print(poem)

在训练过程中，我们每轮随机截取一段长度为30的字符序列作为输入，并在每次迭代后更新模型参数，以便更好地拟合数据。训练时，我们使用交叉熵损失函数和Adam优化器进行优化。训练完成后，我们使用模型生成7句诗，每句4个汉字。