基于循环神经网络的中文拼音文本生成模型

该代码实现了一个基于循环神经网络的中文拼音文本生成模型。具体来说，代码实现了以下功能：

读取并处理拼音文本数据，包括生成字符到索引和索引到字符的字典，以及实现对输入序列的'one-hot'编码。

import torch
import random
import numpy as np

# Load the corpus: one pinyin entry per line of the input file.
with open('/kaggle/input/pinyin-data/pinyin.txt', 'r', encoding='utf-8') as f:
    pinyin_data = f.read().split('\n')

print(len(pinyin_data))
print(pinyin_data[:10])

# Build the character vocabulary.
# - sorted(): iterating a bare set of strings yields a different order on each
#   interpreter run (string hashing is randomised), which would silently change
#   every character's index and make saved weights unusable across runs.
# - {' '}: the data loaders below replace line breaks with spaces before
#   encoding, so the space must be encodable even if no line contains one.
vocab_chars = sorted(set(''.join(pinyin_data)) | {' '})
char_to_idx = {char: i for i, char in enumerate(vocab_chars)}
idx_to_char = dict(enumerate(vocab_chars))

print(char_to_idx)
print(idx_to_char)

vocab_size = len(char_to_idx)
print('Vocab size:', vocab_size)

def one_hot_encode(sequence, vocab_size, dtype=torch.float32):
    """Encode a character sequence as a matrix of one-hot rows.

    Args:
        sequence: Sized iterable of characters. Each character is looked up
            in the module-level ``char_to_idx`` table.
        vocab_size: Width of every one-hot row.
        dtype: Element type of the returned tensor.

    Returns:
        A ``(len(sequence), vocab_size)`` tensor in which row ``k`` holds a
        single 1 at the index of the k-th character and 0 elsewhere.

    Raises:
        KeyError: If a character is not present in ``char_to_idx``.
    """
    length = len(sequence)
    encoded = torch.zeros(length, vocab_size, dtype=dtype)
    columns = torch.tensor(
        [char_to_idx[symbol] for symbol in sequence], dtype=torch.long
    )
    # One indexed assignment sets the (row, column) cell for every position.
    encoded[torch.arange(length), columns] = 1
    return encoded

# Quick sanity check: encode a short sample and display the resulting matrix.
test_sequence = 'ni hao'
test_one_hot = one_hot_encode(sequence=test_sequence, vocab_size=vocab_size)
print(test_one_hot)

实现两种数据批量加载方式，即随机采样和顺序划分。

class _RandomSamplingBatches:
    """Re-iterable batch source; every ``iter()`` yields one shuffled epoch."""

    def __init__(self, examples, batch_size):
        # examples: (num_examples, num_steps, vocab) one-hot windows.
        self._examples = examples
        self._batch_size = batch_size
        self._num_batches = examples.shape[0] // batch_size

    def __len__(self):
        """Return the number of batches produced per epoch."""
        return self._num_batches

    def __iter__(self):
        # A fresh permutation per epoch is what makes the sampling random;
        # it is driven by torch's global RNG (see torch.manual_seed).
        order = torch.randperm(self._examples.shape[0])
        for b in range(self._num_batches):
            picked = order[b * self._batch_size:(b + 1) * self._batch_size]
            batch = self._examples[picked]
            # Targets are the inputs shifted one character to the left.
            yield batch[:, :-1, :], batch[:, 1:, :]


def data_loader_random(text, batch_size, num_steps):
    """Build a randomly sampled batch loader over ``text``.

    Line breaks are replaced by spaces, the text is one-hot encoded and cut
    into consecutive, non-overlapping windows of ``num_steps`` characters.
    Each epoch the windows are shuffled and grouped into batches.

    The result is a finite, re-iterable object rather than an endless
    generator: ``train`` runs ``for inputs, targets in data_loader`` once per
    epoch, which needs every pass to terminate and to be restartable.

    Args:
        text: Raw corpus string.
        batch_size: Number of windows per batch (>= 1).
        num_steps: Window length in characters (>= 2, because a window of
            ``num_steps`` characters gives ``num_steps - 1`` input/target pairs).

    Returns:
        An iterable yielding ``(inputs, targets)`` one-hot tensors, each of
        shape ``(batch_size, num_steps - 1, vocab)``. ``len()`` gives the
        number of batches per epoch.

    Raises:
        ValueError: If ``batch_size`` or ``num_steps`` is too small, or the
            corpus cannot fill a single batch.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if num_steps < 2:
        raise ValueError("num_steps must be at least 2")
    corpus = text.replace('\n', ' ').replace('\r', ' ')
    corpus_one_hot = one_hot_encode(corpus, vocab_size)
    num_examples = corpus_one_hot.shape[0] // num_steps
    if num_examples < batch_size:
        # The original looped forever without yielding in this situation.
        raise ValueError("corpus is too short to form a single batch")
    examples = corpus_one_hot[:num_examples * num_steps].view(
        num_examples, num_steps, corpus_one_hot.shape[1]
    )
    return _RandomSamplingBatches(examples, batch_size)

class _SequentialBatches:
    """Re-iterable batch source; every ``iter()`` walks the corpus in order."""

    def __init__(self, inputs, targets, num_steps):
        # inputs / targets: (batch_size, steps_per_row, vocab) one-hot rows,
        # where targets is inputs shifted by one character.
        self._inputs = inputs
        self._targets = targets
        self._num_steps = num_steps
        self._num_batches = inputs.shape[1] // num_steps

    def __len__(self):
        """Return the number of batches produced per epoch."""
        return self._num_batches

    def __iter__(self):
        for b in range(self._num_batches):
            window = slice(b * self._num_steps, (b + 1) * self._num_steps)
            yield self._inputs[:, window, :], self._targets[:, window, :]


def data_loader_sequential(text, batch_size, num_steps):
    """Build a sequentially partitioned batch loader over ``text``.

    Line breaks are replaced by spaces and the text is one-hot encoded. The
    usable prefix is split into ``batch_size`` contiguous rows; successive
    batches take successive ``num_steps``-wide slices of those rows, so row
    ``r`` of one batch continues directly in row ``r`` of the next.

    The result is a finite, re-iterable object rather than an endless
    generator: ``train`` runs ``for inputs, targets in data_loader`` once per
    epoch, which needs every pass to terminate and to be restartable.

    Args:
        text: Raw corpus string.
        batch_size: Number of parallel rows (>= 1).
        num_steps: Time steps per batch (>= 1).

    Returns:
        An iterable yielding ``(inputs, targets)`` one-hot tensors, each of
        shape ``(batch_size, num_steps, vocab)``, with ``targets`` being
        ``inputs`` shifted by one character. ``len()`` gives the number of
        batches per epoch.

    Raises:
        ValueError: If ``batch_size`` or ``num_steps`` is below 1, or the
            corpus cannot fill a single batch.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if num_steps < 1:
        raise ValueError("num_steps must be at least 1")
    corpus = text.replace('\n', ' ').replace('\r', ' ')
    corpus_one_hot = one_hot_encode(corpus, vocab_size)
    # One character is held back so the final input still has a target.
    num_batches = (corpus_one_hot.shape[0] - 1) // (batch_size * num_steps)
    if num_batches < 1:
        raise ValueError("corpus is too short to form a single batch")
    usable = num_batches * batch_size * num_steps
    width = corpus_one_hot.shape[1]
    # Inputs and targets are reshaped separately: a single tensor of
    # ``usable + 1`` rows cannot be viewed as (batch_size, ..., num_steps).
    inputs = corpus_one_hot[:usable].reshape(batch_size, -1, width)
    targets = corpus_one_hot[1:usable + 1].reshape(batch_size, -1, width)
    return _SequentialBatches(inputs, targets, num_steps)

定义了一个基于循环神经网络的生成模型，可以选择使用RNN或GRU作为循环层。

class RNNModel(torch.nn.Module):
    """Recurrent layer stack (RNN or GRU) followed by a linear output layer.

    Args:
        input_size: Feature size of each input time step.
        hidden_size: Hidden state size of the recurrent layers.
        output_size: Feature size produced by the final linear layer.
        num_layers: Number of stacked recurrent layers.
        rnn_type: ``'rnn'`` or ``'gru'``.
        device: Stored as ``self.device`` for backward compatibility. The
            forward pass does not use it; it follows the input tensor.

    Raises:
        ValueError: If ``rnn_type`` is not one of the supported names.
    """

    # Supported recurrent layer constructors, keyed by ``rnn_type``.
    _RNN_LAYERS = {'rnn': torch.nn.RNN, 'gru': torch.nn.GRU}

    def __init__(self, input_size, hidden_size, output_size, num_layers, rnn_type='rnn', device='cpu'):
        super().__init__()
        if rnn_type not in self._RNN_LAYERS:
            raise ValueError("Invalid RNN type: " + str(rnn_type))
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn_type = rnn_type
        self.device = device
        self.rnn = self._RNN_LAYERS[rnn_type](
            input_size, hidden_size, num_layers, batch_first=True
        )
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x, h=None):
        """Run the recurrent stack and project every time step.

        Args:
            x: Input of shape ``(batch, steps, input_size)``.
            h: Optional initial hidden state of shape
                ``(num_layers, batch, hidden_size)``; zeros when omitted.

        Returns:
            ``(out, h)`` where ``out`` has shape
            ``(batch * steps, output_size)``, flattened batch-major, and
            ``h`` is the final hidden state.
        """
        if h is None:
            # Allocate on the input's device/dtype instead of ``self.device``:
            # the constructor argument goes stale once the module is moved
            # with ``.to(...)`` or cast with ``.double()``.
            h = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        out, h = self.rnn(x, h)
        out = self.fc(out.reshape(-1, self.hidden_size))
        return out, h

实现对生成模型的训练和预测功能，包括训练过程中的梯度裁剪，以及预测时逐字符选取概率最大的字符（贪心解码）。

def train(model, data_loader, num_epochs, learning_rate, grad_clip, device='cpu'):
    """Train ``model`` with Adam and cross-entropy loss, clipping gradients.

    Args:
        model: Module called as ``model(inputs, h)`` and returning
            ``(logits, h)``. ``logits`` must hold one row of class scores per
            (batch, step) position in batch-major order, either flattened to
            ``(batch * steps, classes)`` or as ``(batch, steps, classes)``.
        data_loader: Iterable of ``(inputs, targets)`` batches that is
            iterated once per epoch, so it must be finite and re-iterable.
            ``inputs`` is ``(batch, steps, features)``. ``targets`` is either
            one-hot with the same rank as ``inputs`` or class indices of
            shape ``(batch, steps)``.
        num_epochs: Number of passes over ``data_loader``.
        learning_rate: Adam learning rate.
        grad_clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        device: Device the model and every batch are moved to.

    Returns:
        List with the sample-weighted mean loss of each epoch.

    Raises:
        ValueError: If ``data_loader`` yields no batches in an epoch.
    """
    model.to(device)
    model.train()
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_samples = 0
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            if targets.dim() == inputs.dim():
                # CrossEntropyLoss wants class indices; one-hot rows are
                # converted by taking the position of the 1.
                targets = targets.argmax(dim=-1)
            targets = targets.long().reshape(-1)
            batch_size = inputs.size(0)

            optimizer.zero_grad()
            # One pass over the whole sequence gives the same mean loss as
            # stepping through time in Python and averaging the step losses.
            logits, _ = model(inputs, None)
            logits = logits.reshape(-1, logits.size(-1))
            loss = criterion(logits, targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

            total_loss += loss.item() * batch_size
            num_samples += batch_size
        if num_samples == 0:
            raise ValueError(
                "data_loader yielded no batches; it must be a finite, "
                "re-iterable source with at least one batch per epoch"
            )
        epoch_loss = total_loss / num_samples
        epoch_losses.append(epoch_loss)
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")
    return epoch_losses

def predict(model, prefix, num_predictions, device='cpu'):
    """Extend ``prefix`` by greedy decoding with ``model``.

    The prefix is fed through the model once to warm up the hidden state.
    Each following character is the highest-scoring class of the most recent
    time step, and is fed back as the next input.

    Args:
        model: Module called as ``model(x, h)`` with ``x`` of shape
            ``(1, steps, vocab)`` and returning ``(logits, h)``, where
            ``logits`` has one row per time step.
        prefix: Non-empty seed string; every character must be in the
            module-level vocabulary.
        num_predictions: Number of characters to append.
        device: Device the model and inputs are moved to.

    Returns:
        ``prefix`` followed by ``num_predictions`` generated characters.

    Raises:
        ValueError: If ``prefix`` is empty.
    """
    if not prefix:
        raise ValueError("prefix must contain at least one character")
    model.to(device)
    was_training = model.training
    model.eval()
    generated = []
    try:
        with torch.no_grad():
            warmup = one_hot_encode(prefix, vocab_size).to(device)
            # The last logits row already predicts the character after the
            # prefix, so the final prefix character is not fed a second time.
            logits, h = model(warmup.unsqueeze(0))
            for _ in range(num_predictions):
                next_char = idx_to_char[logits[-1].argmax().item()]
                generated.append(next_char)
                if len(generated) == num_predictions:
                    break  # no need to run the model on the last character
                step = one_hot_encode(next_char, vocab_size).to(device)
                logits, h = model(step.unsqueeze(0), h)
    finally:
        # Leave the module in whatever mode the caller had it in.
        model.train(was_training)
    return prefix + ''.join(generated)

# Random sampling
# 'gpu' is not a valid PyTorch device string; use CUDA when present, else CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 32
num_steps = 10
data_loader = data_loader_random('\n'.join(pinyin_data), batch_size, num_steps)
# Pass the device to the model too, so the hidden state it creates matches the inputs.
model = RNNModel(vocab_size, 128, vocab_size, 2, rnn_type='gru', device=device)
train(model, data_loader, num_epochs=10, learning_rate=0.001, grad_clip=1.0, device=device)

# Test
prefix = 'ni'
num_predictions = 10
print(predict(model, prefix, num_predictions, device=device))

# Sequential partitioning
# 'gpu' is not a valid PyTorch device string; use CUDA when present, else CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 32
num_steps = 10
data_loader = data_loader_sequential('\n'.join(pinyin_data), batch_size, num_steps)
# Pass the device to the model too, so the hidden state it creates matches the inputs.
model = RNNModel(vocab_size, 128, vocab_size, 2, rnn_type='gru', device=device)
train(model, data_loader, num_epochs=10, learning_rate=0.001, grad_clip=1.0, device=device)

# Test
prefix = 'ni'
num_predictions = 10
print(predict(model, prefix, num_predictions, device=device))

总的来说，代码实现了一个基本的中文拼音文本生成模型，并提供了两种数据批量加载方式和两种循环层（RNN 或 GRU）选择。需要注意的是，训练和预测时需要指定合适的设备：在 PyTorch 中，CPU 的设备字符串是 'cpu'，GPU 的设备字符串是 'cuda'（而不是 'gpu'）；模型和数据必须位于同一设备上，在有 GPU 的环境下使用 'cuda' 可以显著提高运行速度。