导入相关库

import numpy as np
import torch
import torch.nn as nn

定义数据集

class PinyinDataset:
    """Character-level pinyin -> hanzi dataset.

    Each line of the input file is "pinyin tokens<TAB>hanzi tokens", with
    tokens separated by single spaces.  Pinyin and hanzi tokens share one
    vocabulary, and each sample is returned as a pair of one-hot tensors.
    """

    def __init__(self, filename):
        """Read *filename* and build the sample list and vocabulary maps.

        Args:
            filename: path to a UTF-8 text file with one tab-separated
                "pinyin<TAB>hanzi" pair per line; blank lines are skipped.
        """
        self.vocab = set()
        self.char2idx = {}
        self.idx2char = {}
        self.data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    continue
                pinyin, hanzi = line.split('\t')
                pinyin = pinyin.split(' ')
                hanzi = hanzi.split(' ')
                for c in pinyin:
                    self.vocab.add(c)
                for c in hanzi:
                    self.vocab.add(c)
                self.data.append((pinyin, hanzi))
        # Sort so the token -> index mapping is deterministic across runs.
        self.vocab = sorted(self.vocab)
        self.char2idx = {c: i for i, c in enumerate(self.vocab)}
        self.idx2char = {i: c for i, c in enumerate(self.vocab)}

    def __len__(self):
        """Number of (pinyin, hanzi) pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return one-hot tensors (x, y), each (seq_len, vocab_size)."""
        pinyin, hanzi = self.data[index]
        x = torch.zeros(len(pinyin), len(self.vocab))
        y = torch.zeros(len(hanzi), len(self.vocab))
        for i, c in enumerate(pinyin):
            x[i, self.char2idx[c]] = 1
        for i, c in enumerate(hanzi):
            y[i, self.char2idx[c]] = 1
        return x, y

定义模型

class PinyinRNN(nn.Module):
    """Uni-directional RNN/GRU over one-hot token sequences.

    Maps (batch, seq, input_size) inputs to (batch, seq, output_size)
    logits via a recurrent layer followed by a linear projection.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1, rnn_type='rnn'):
        """Build the recurrent layer and output projection.

        Args:
            input_size: size of each one-hot input vector (vocab size).
            hidden_size: recurrent hidden state size.
            output_size: number of output classes (vocab size).
            num_layers: number of stacked recurrent layers.
            rnn_type: 'rnn' for nn.RNN or 'gru' for nn.GRU.

        Raises:
            ValueError: if rnn_type is neither 'rnn' nor 'gru'.
        """
        super(PinyinRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn_type = rnn_type
        if rnn_type == 'rnn':
            self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        else:
            # Originally an unknown type silently left self.rnn undefined
            # and failed later inside forward(); fail fast here instead.
            raise ValueError("rnn_type must be 'rnn' or 'gru', got %r" % (rnn_type,))
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0=None):
        """Run the sequence through the RNN and project to logits.

        Args:
            x: input of shape (batch, seq, input_size).
            h0: optional initial hidden state (num_layers, batch, hidden_size);
                zeros on x's device when omitted.

        Returns:
            (out, hn) where out is (batch, seq, output_size) logits and hn
            is the final hidden state.
        """
        if h0 is None:
            # Allocate the initial state on the same device as the input so
            # the model also works on GPU inputs.
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)
        # nn.RNN and nn.GRU share the same call signature, so the original
        # duplicated if/elif branches collapse to one call.
        out, hn = self.rnn(x, h0)
        out = self.fc(out)
        return out, hn

准备数据

# Build the dataset and a shuffled mini-batch loader.
dataset = PinyinDataset('/kaggle/input/pinyin-data/pinyin.txt')
batch_size = 32
# NOTE(review): samples have variable sequence lengths, so the default
# collate_fn will fail for batch_size > 1 unless all lines have equal
# length or a padding collate_fn is supplied — confirm against the data.
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

定义模型超参数

# Model hyperparameters: inputs and outputs are one-hot over the same
# shared vocabulary, so both sizes equal the vocabulary size.
input_size = len(dataset.vocab)
hidden_size = 64
output_size = len(dataset.vocab)
num_layers = 1
rnn_type = 'rnn'

定义模型和损失函数

# Instantiate the model and the loss.  CrossEntropyLoss expects raw logits
# and integer class targets, which matches how the training loop calls it.
model = PinyinRNN(input_size, hidden_size, output_size, num_layers=num_layers, rnn_type=rnn_type)
criterion = nn.CrossEntropyLoss()

定义优化器

# Adam optimizer over all model parameters.
lr = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

训练模型

# Training loop with gradient clipping to stabilize RNN training.
num_epochs = 100
clip = 5.0  # max gradient norm for clip_grad_norm_

for epoch in range(num_epochs):
    total_loss = 0
    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        out, hn = model(x)
        # Average the per-sample cross-entropy over the batch; targets are
        # the argmax of the one-hot labels, i.e. the integer class indices.
        loss = 0
        for j in range(out.size(0)):
            loss += criterion(out[j], torch.argmax(y[j], dim=1))
        loss /= out.size(0)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item()
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, total_loss / (i + 1)))

测试模型

# Inference: run a pinyin prefix through the model and decode predictions.
prefix = 'ni hao'
# The vocabulary was built from space-separated tokens, so the prefix must
# be tokenized the same way; iterating raw characters (as the original did)
# would look up characters like 'n' that are not in the vocabulary.
tokens = prefix.split(' ')
x = torch.zeros(len(tokens), len(dataset.vocab))
for i, c in enumerate(tokens):
    x[i, dataset.char2idx[c]] = 1
out, hn = model(x.unsqueeze(0))           # add batch dim: (1, seq, vocab)
pred = torch.argmax(out.squeeze(0), dim=1)  # (seq,) class indices
print(prefix + ''.join(dataset.idx2char[int(i)] for i in pred))

K步预测

# Autoregressive K-step prediction: encode the prefix once, then repeatedly
# feed the last predicted token (one-hot) and hidden state back in.
K = 10
prefix = 'ni hao'
tokens = prefix.split(' ')  # tokenize consistently with the vocabulary
x = torch.zeros(len(tokens), len(dataset.vocab))
for i, c in enumerate(tokens):
    x[i, dataset.char2idx[c]] = 1
out, hn = model(x.unsqueeze(0))
pred = torch.argmax(out.squeeze(0), dim=1).tolist()
for _ in range(K):
    # Build a (batch=1, seq=1, vocab) one-hot step; the original passed a
    # 2-D tensor with a batched hidden state, which nn.RNN rejects.
    x = torch.zeros(1, 1, len(dataset.vocab))
    x[0, 0, pred[-1]] = 1
    out, hn = model(x, hn)
    pred.append(int(torch.argmax(out[0, -1]).item()))
# Original line was missing the closing parenthesis on this print call.
print(prefix + ''.join(dataset.idx2char[int(i)] for i in pred))

根据下面代码完成以下内容，给出完善后的代码：使用循环神经网络学习汉语拼音的拼写。本次实验重点为准备数据和模型。拼音数据（无声调）：/kaggle/input/pinyin-data/pinyin.txt。定义数据集：采用字符模型，因此一个字符为一个样本；每个样本采用 one-hot 编码；样本是时间相关的，分别实现序列的随机采样和序列的顺序划分；标签 Y 与 X 同形状，但时间超前 1。准备数据：一次梯度更新使用的数据形状为（时间步, Batch, 词表大小）。

原文地址: https://www.cveoy.top/t/topic/fGDy 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录