Implementation of an RNN-Based Chinese Pinyin Prediction Model
import numpy as np
import matplotlib.pyplot as plt
import torch
import collections  # used to count character occurrences
Read the data
with open(r'D:\汉语音节表 (1).txt', encoding='UTF-8') as f:
    line = f.readline()  # the file contains only one line
Tokenize the text; in this example each token is a single character
tokens = list(line)
print('The syllable table contains %d characters' % len(tokens))
Build a character vocabulary that maps each character to an integer ID. collections.Counter can be used to count occurrences.
counter = collections.Counter(tokens)
count_pairs = sorted(counter.items(), key=lambda x: -x[1])  # sort by frequency, descending
chars, _ = zip(*count_pairs)
char2id = dict(zip(chars, range(len(chars))))
print('Vocabulary size: %d' % len(char2id))
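As a quick illustration of what this construction produces, here is the same recipe applied to a toy string (a made-up example, not part of the syllable table):

demo_counter = collections.Counter(list('banana'))    # Counter({'a': 3, 'n': 2, 'b': 1})
demo_pairs = sorted(demo_counter.items(), key=lambda x: -x[1])
demo_chars, _ = zip(*demo_pairs)
print(dict(zip(demo_chars, range(len(demo_chars)))))  # {'a': 0, 'n': 1, 'b': 2}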
Convert every character to its integer ID
token_ids = [char2id[char] for char in tokens]
vocab_size = len(char2id)
Convert the data to a tensor. The resulting layout is (time steps, batch_size): each column holds one contiguous subsequence of the corpus.
def to_tensor(data, batch_size):
    num_steps = len(data) // batch_size
    data = data[:num_steps * batch_size]  # drop the remainder so the data divides evenly
    data = torch.tensor(data, dtype=torch.int64).view(batch_size, -1).t().contiguous()
    return data  # shape: (num_steps, batch_size)
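A quick shape check on toy input (the integers 0-9, a hypothetical stand-in for token IDs) shows the (time_steps, batch_size) layout, with each column holding a contiguous slice of the data:

toy = to_tensor(list(range(10)), batch_size=2)
print(toy.shape)  # torch.Size([5, 2])
print(toy[:, 0])  # tensor([0, 1, 2, 3, 4]) -- the first column is one contiguous sequence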
Partition the dataset: split the time axis into chunks of length seq_len
def data_partition(data, seq_len):
    # data: (time_steps, batch_size) -> (num_chunks, seq_len, batch_size)
    num_chunks = data.shape[0] // seq_len
    data = data[:num_chunks * seq_len]
    return data.view(num_chunks, seq_len, -1)
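Continuing the toy example (hypothetical numbers again), partitioning stacks chunks of shape (seq_len, batch_size) along a new leading axis:

chunks = data_partition(to_tensor(list(range(24)), batch_size=2), seq_len=3)
print(chunks.shape)  # torch.Size([4, 3, 2]) -- (num_chunks, seq_len, batch_size)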
Build the model
class RNNModel(torch.nn.Module):
    def __init__(self, rnn_type, input_size, hidden_size, output_size, num_layers):
        super(RNNModel, self).__init__()
        self.rnn_type = rnn_type
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # inputs are fed as (seq_len, batch_size, input_size), so batch_first is left False
        if rnn_type == 'RNN':
            self.rnn = torch.nn.RNN(input_size, hidden_size, num_layers)
        elif rnn_type == 'GRU':
            self.rnn = torch.nn.GRU(input_size, hidden_size, num_layers)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, inputs, hidden):
        output, hidden = self.rnn(inputs, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        # both RNN and GRU use a single hidden-state tensor
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)
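A minimal sanity check of the forward pass, using small made-up sizes (all numbers here are hypothetical):

m = RNNModel('RNN', input_size=5, hidden_size=8, output_size=5, num_layers=1)
x = torch.zeros(10, 2, 5)        # (seq_len, batch_size, input_size)
h = m.init_hidden(batch_size=2)  # (num_layers, batch_size, hidden_size)
y, h = m(x, h)
print(y.shape, h.shape)          # torch.Size([10, 2, 5]) torch.Size([1, 2, 8])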
Define the hyperparameters
batch_size = 32
seq_len = 64
input_size = vocab_size  # inputs are one-hot vectors, so input_size must equal vocab_size
hidden_size = 128
output_size = vocab_size
num_layers = 1
learning_rate = 0.01
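Since the inputs will be one-hot encoded, input_size must equal vocab_size; a small check with made-up IDs (num_classes=5 is arbitrary here) shows that the encoding appends vocab_size as the last dimension:

ids = torch.tensor([[0, 1], [2, 0]])  # (seq_len=2, batch_size=2)
onehot = torch.nn.functional.one_hot(ids, num_classes=5).float()
print(onehot.shape)  # torch.Size([2, 2, 5])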
Convert the dataset to a tensor
data = to_tensor(token_ids, batch_size)
Split into inputs and targets (the targets are the inputs shifted by one time step)
train_data = data_partition(data[:-(seq_len + 1)], seq_len)
target_data = data_partition(data[1:-seq_len], seq_len)
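A quick check (just an illustration of the alignment) confirms that the targets lead the inputs by exactly one step:

print(train_data.shape == target_data.shape)        # True
print(train_data[0, 1, 0] == target_data[0, 0, 0])  # tensor(True): target at step t is input at step t+1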
Create the model, loss function, and optimizer
model = RNNModel('RNN', input_size, hidden_size, output_size, num_layers)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
Train the model
num_epochs = 10
for epoch in range(num_epochs):
    hidden = model.init_hidden(batch_size)
    train_loss = 0.0
    for i in range(train_data.shape[0]):
        inputs = train_data[i]                # (seq_len, batch_size)
        targets = target_data[i].reshape(-1)  # flatten to (seq_len * batch_size,)
        # one-hot encode the inputs: (seq_len, batch_size, vocab_size)
        inputs = torch.nn.functional.one_hot(inputs, num_classes=vocab_size).float()
        # detach the hidden state so gradients do not flow across chunk boundaries
        hidden = hidden.detach()
        outputs, hidden = model(inputs, hidden)
        loss = criterion(outputs.view(-1, output_size), targets)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # gradient clipping
        optimizer.step()
        train_loss += loss.item()
    train_loss /= train_data.shape[0]
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, train_loss))
Test the model
def predict(model, prefix, hidden=None, k=1):
    # convert the prefix string to a tensor of character IDs: (prefix_len, batch_size=1)
    prefix_ids = [char2id[char] for char in prefix]
    prefix_ids = torch.tensor(prefix_ids, dtype=torch.int64).view(len(prefix_ids), 1)
    # one-hot encode: (prefix_len, 1, vocab_size)
    prefix_onehot = torch.nn.functional.one_hot(prefix_ids, num_classes=vocab_size).float()
    outputs, hidden = model(prefix_onehot, hidden)
    outputs = outputs[-1].squeeze(0)  # output of the last time step: (vocab_size,)
    # take the k most likely next characters
    _, top_k_predictions = outputs.topk(k)
    top_k_predictions = [chars[index] for index in top_k_predictions.tolist()]
    return top_k_predictions, hidden
Single-step prediction
prefix = '汉'
predictions, _ = predict(model, prefix)
print(' ' + prefix + ''.join(predictions))
Perform k-step prediction
prefix = '汉'
hidden = None
for i in range(5):
    predictions, hidden = predict(model, prefix, hidden=hidden, k=3)
    print(' ' + prefix + ''.join(predictions))
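The loop above re-feeds the same one-character prefix while carrying the hidden state forward. An alternative sketch (greedy decoding; this variant is an assumption about the intended use, not part of the original) feeds each top-1 prediction back in as the next input:

generated = '汉'
hidden = None
for _ in range(5):
    preds, hidden = predict(model, generated[-1], hidden=hidden, k=1)
    generated += preds[0]  # append the most likely next character
print(generated)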