import torch
from torch import nn
# Toy lookup table: a vocabulary of 5 tokens, each mapped to a 4-dimensional vector.
embedding = nn.Embedding(5, 4)
# Two sentences given as token ids (how:1 are:2 you:3 / are:2 you:3 ok:4).
sents = [
    [1, 2, 3],
    [2, 3, 4],
]
embed = embedding(torch.tensor(sents, dtype=torch.long))
print(embed)  # shape=(2, 3, 4): (sentences, tokens per sentence, embedding dim)
tensor([[[-0.6991, -0.3340, -0.7701, -0.6255],
[ 0.2969, 0.4720, -0.9403, 0.2982],
[ 0.8902, -1.0681, 0.4035, 0.1645]],
[[ 0.2969, 0.4720, -0.9403, 0.2982],
[ 0.8902, -1.0681, 0.4035, 0.1645],
[-0.7944, -0.1766, -1.5941, 0.4544]]], grad_fn=<EmbeddingBackward>)
2.2 nn.RNN
对RNN单元的改进有LSTM和GRU,这三种类型的模型的输入数据都需要3D的tensor。使用时将batch_first设置为True,输入数据的shape为[batch, seq_length, input_dim]:第一维为batch的数量(不使用批处理时设置为1),第二维为序列的长度,第三维为输入的维度,通常为词嵌入的维度。
rnn = nn.RNN(input_dim, hidden_dim, num_layers=1, batch_first=True, bidirectional=False)
input_dim 输入token的特征数量,使用embedding时为嵌入的维度
hidden_dim 隐层的单元数,决定RNN的输出长度
num_layers 层数
batch_first 为True时第一维为batch,反之第一维为seq_len,默认为False
bidirectional 是否为双向RNN,默认为False
output, hidden = rnn(input, hidden)
input 一批输入数据,shape为[batch, seq_len, input_dim]
hidden 上一时刻的隐层状态,shape为[num_layers * num_directions, batch, hidden_dim]
output 当前时刻的输出,shape为[batch, seq_len, num_directions*hidden_dim]
import torch
from torch import nn
# Minimal nn.RNN walk-through: embed two 3-token sentences and run them through a
# single-layer, unidirectional RNN.
vocab_size = 5
embed_dim = 3
hidden_dim = 8
embedding = nn.Embedding(vocab_size, embed_dim)
rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
sents = [[1, 2, 4],
         [2, 3, 4]]
# The embedded batch has to exist before h0 is built, because h0 takes its batch size from it.
embeded = embedding(torch.LongTensor(sents))  # shape=(batch=2, seq_len=3, embed_dim=3)
# Initial hidden state, shape=(num_layers*num_directions, batch, hidden_dim).
h0 = torch.zeros(1, embeded.size(0), hidden_dim)
out, hidden = rnn(embeded, h0)  # out.shape=(2, 3, 8), hidden.shape=(1, 2, 8)
print(out, hidden)
tensor([[[-0.1556, -0.2721, 0.1485, -0.2081, -0.2231, -0.1459, -0.0319, 0.2617],
[-0.0274, 0.1561, -0.0509, -0.1723, -0.2678, -0.2616, 0.0786, 0.4124],
[ 0.2346, 0.4487, -0.1409, -0.0807, -0.0232, -0.4975, 0.4244, 0.8337]],
[[ 0.0879, 0.1122, 0.1502, -0.3033, -0.2715, -0.1191, 0.1367, 0.5275],
[ 0.2258, 0.4395, -0.1365, 0.0135, -0.0777, -0.5221, 0.4683, 0.8115],
[ 0.0158, 0.3471, 0.0742, -0.0550, -0.0098, -0.5521, 0.5923,0.8782]]], grad_fn=<TransposeBackward0>)
tensor([[[ 0.2346, 0.4487, -0.1409, -0.0807, -0.0232, -0.4975, 0.4244, 0.8337],
[ 0.0158, 0.3471, 0.0742, -0.0550, -0.0098, -0.5521, 0.5923, 0.8782]]], grad_fn=<ViewBackward>)
2.3 nn.LSTM
lstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True, bidirectional=False)
input_dim 输入word的特征数量,使用embedding时为嵌入的维度
hidden_dim 隐层的单元数
output, (hidden, cell) = lstm(input, (hidden, cell))
input 一批输入数据,shape为[batch, seq_len, input_dim]
hidden 当前时刻的隐层状态,shape为[num_layers * num_directions, batch, hidden_dim]
cell 当前时刻的记忆状态,shape为[num_layers * num_directions, batch, hidden_dim]
output 当前时刻的输出,shape为[batch, seq_len, num_directions*hidden_dim]
2.4 nn.GRU
rnn = nn.GRU(input_dim, hidden_dim, num_layers=1, batch_first=True, bidirectional=False)
input_dim 输入word的特征数量,使用embedding时为嵌入的维度
hidden_dim 隐层的单元数
output, hidden = rnn(input, hidden)
input 一批输入数据,shape为[batch, seq_len, input_dim]
hidden 上一时刻的隐层状态,shape为[num_layers*num_directions, batch, hidden_dim]
output 当前时刻的输出,shape为[batch, seq_len, num_directions*hidden_dim]
2.5 损失函数
CrossEntropyLoss 交叉熵误差
x : 包含每个类的得分,2-D tensor, shape=(batch, n)
class: 长度为batch 的 1D tensor,每个数值为类别的索引(0到 n-1)
3 字符级RNN的分类应用
网络的训练和使用封装为Model类,提供三个方法。train(), evaluate(),predict()分别用于训练,评估和预测使用。具体见下面的代码及注释。
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import sklearn
import string
import random
from nltk.corpus import names
# Run on the GPU when torch can see one; otherwise stay on the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")
# Alphabet of characters a name may contain: a-z plus hyphen, space and apostrophe.
chars = "".join((string.ascii_lowercase, "-", " ", "'"))
# 例如名字 "abc" 的 one-hot 编码为:
# abc = [[1, 0, ..., 0],
#        [0, 1, 0, ...],
#        [0, 0, 1, ...]]
# abc.shape = (len("abc"), len(chars))
def name2vec(name):
    """One-hot encode a name, one row per character.

    Backslash characters are skipped. Every remaining character is looked up in the
    module-level ``chars`` alphabet; ``str.index`` raises ``ValueError`` for a
    character that is not part of it.

    Returns:
        A NumPy array of shape ``(number of kept characters, len(chars))`` filled with
        zeros except for a single 1 per row.
    """
    positions = []
    for ch in name:
        if ch == "\\":
            continue
        positions.append(chars.index(ch))
    one_hot = np.zeros(shape=(len(positions), len(chars)))
    for row, col in enumerate(positions):
        one_hot[row, col] = 1
    return one_hot
def load_data():
    """Build the labelled, encoded data set from the NLTK ``names`` corpus.

    ``names.fileids()`` is unpacked into two file ids. Names from the first file get
    label 0 and names from the second get label 1; each name is lower-cased and encoded
    with ``name2vec``.

    Returns:
        A list of ``(encoded_name, label)`` tuples, first-file names first.
    """
    female_file, male_file = names.fileids()
    samples = []
    for label, file_id in enumerate((female_file, male_file)):
        for raw_name in names.words(file_id):
            samples.append((name2vec(raw_name.lower()), label))
    return samples
class CharRNN(nn.Module):
    """Character-level RNN classifier: one-hot characters -> RNN -> linear layer.

    Args:
        vocab_size: width of each one-hot character vector (the RNN input size).
        hidden_size: number of hidden units in the RNN.
        output_size: number of classes scored by the linear layer.
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        super(CharRNN, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        # Attribute name kept as-is so parameter names in saved checkpoints stay valid.
        self.liner = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        """Score a batch of one-hot encoded sequences.

        Args:
            input: float tensor of shape [batch, seq_len, vocab_size].

        Returns:
            Tensor of shape [batch, output_size] with unnormalised class scores (logits).
            ``argmax`` over the scores picks the same class as it would after a softmax.
        """
        # Size the initial hidden state from the input and create it on the input's
        # device/dtype, so any batch size works and no module-level global is required.
        h0 = torch.zeros(
            1, input.size(0), self.hidden_size, device=input.device, dtype=input.dtype
        )
        output, _ = self.rnn(input, h0)
        # Only the last time step is used as the feature vector of the sequence.
        last_step = output[:, -1, :]
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself, so an extra
        # softmax here would squash the scores and weaken the gradients.
        return self.liner(last_step)
# Number of hidden units passed to CharRNN when Model builds the network.
hidden_dim = 128
# Number of output classes passed to CharRNN (load_data produces the labels 0 and 1).
output_dim = 2
class Model:
    """Training / evaluation / prediction wrapper around CharRNN for name classification.

    Labels follow the convention used by predict(): 0 is female, anything else is male.

    Args:
        epoches: number of epochs run by train().
    """

    def __init__(self, epoches=100):
        # Inputs are created on `device`, so the network has to live there as well.
        self.model = CharRNN(len(chars), hidden_dim, output_dim).to(device)
        self.epoches = epoches

    def _to_batch(self, encoded_name):
        """Turn one encoded name [seq_len, vocab] into a float batch [1, seq_len, vocab].

        Names have different lengths and are not padded, so every batch holds one name.
        """
        return torch.tensor(encoded_name, dtype=torch.float, device=device).unsqueeze(0)

    def train(self, train_set, steps_per_epoch=1000):
        """Train on randomly drawn samples and print the mean loss of every epoch.

        Args:
            train_set: sequence of (encoded_name, label) pairs.
            steps_per_epoch: number of random single-sample updates per epoch.
        """
        loss_func = nn.CrossEntropyLoss()
        optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.0003)
        self.model.train()
        for epoch in range(self.epoches):
            total_loss = 0.0
            for _ in range(steps_per_epoch):
                name, label = random.choice(train_set)
                name_tensor = self._to_batch(name)
                # CrossEntropyLoss takes the class index, not a one-hot vector.
                label_tensor = torch.tensor([label], dtype=torch.long, device=device)
                optimizer.zero_grad()
                pred = self.model(name_tensor)  # [batch, out_dim]
                loss = loss_func(pred, label_tensor)
                # Without backward() and step() the weights would never change.
                loss.backward()
                optimizer.step()
                # .item() detaches the value so the autograd graph is not kept alive.
                total_loss += loss.item()
            print("Training: in epoch {} loss {}".format(epoch, total_loss/steps_per_epoch))

    def evaluate(self, test_set, n_samples=1000):
        """Print the accuracy (in percent) over randomly drawn test samples.

        Args:
            test_set: sequence of (encoded_name, label) pairs.
            n_samples: number of random draws (with replacement) to score.
        """
        self.model.eval()
        correct = 0
        with torch.no_grad():  # no gradients are needed for evaluation
            for _ in range(n_samples):
                name, label = random.choice(test_set)
                pred = self.model(self._to_batch(name))
                if torch.argmax(pred).item() == label:
                    correct += 1
        print('Evaluating: test accuracy is {}%'.format(100.0 * correct / n_samples))

    def predict(self, name):
        """Print the predicted gender ('female' or 'male') for a raw name string."""
        self.model.eval()
        encoded = name2vec(name.lower())
        with torch.no_grad():
            out = self.model(self._to_batch(encoded))
        out = torch.argmax(out).item()
        sexy = 'female' if out == 0 else 'male'
        print('{} is {}'.format(name, sexy))
if __name__ == "__main__":
    # A bare `import sklearn` does not load the model_selection submodule, so the
    # splitting helper is imported explicitly here.
    from sklearn.model_selection import train_test_split

    model = Model(10)
    data_set = load_data()
    train, test = train_test_split(data_set)
    # Train on the training split, report accuracy on the held-out split, then
    # classify two example names.
    model.train(train)
    model.evaluate(test)
    model.predict("Jim")
    model.predict("Kate")
Evaluating: test accuracy is 82.6%
Jim is male
Kate is female
4 基于字符级RNN的文本生成
模型由类CharRNN实现,模型的训练和使用由Model类实现,提供了train()和sample()方法:前者用于训练模型,后者用于从训练好的模型中采样生成名字。
# coding=utf-8
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import string
import random
import nltk
from nltk.corpus import names
# Run on the GPU when torch can see one; otherwise stay on the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")
# Alphabet: a-z, hyphen, space, apostrophe, plus '!' which marks the end of a name.
chars = "".join((string.ascii_lowercase, "-", " ", "'", "!"))
hidden_dim = 128
# The network predicts one score per character of the alphabet.
output_dim = len(chars)
# Example: "abc" is encoded as [[1, 0, ...], [0, 1, ...], [0, 0, 1, ...]].
def name2input(name):
    """One-hot encode a string, one row per character.

    Backslash characters are skipped. Every remaining character is looked up in the
    module-level ``chars`` alphabet; ``str.index`` raises ``ValueError`` for a
    character that is not part of it.

    Returns:
        An int64 NumPy array of shape ``(number of kept characters, len(chars))``.
    """
    ids = [chars.index(c) for c in name if c != "\\"]
    # np.int64 instead of np.long: the np.long alias is not available on every NumPy
    # release, which made this function fail with AttributeError.
    a = np.zeros(shape=(len(ids), len(chars)), dtype=np.int64)
    for i, idx in enumerate(ids):
        a[i][idx] = 1
    return a
# Example: "abc" is encoded as [0, 1, 2].
def name2target(name):
    """Map each character of ``name`` to its index in the module-level ``chars``.

    Backslash characters are skipped; ``str.index`` raises ``ValueError`` for a
    character that is not in the alphabet.

    Returns:
        A list of integer indices, one per kept character.
    """
    kept = (ch for ch in name if ch != "\\")
    return [chars.index(ch) for ch in kept]
# female -> [[1, 0]], male -> [[0, 1]]
def sexy2input(sexy):
    """One-hot encode a gender label.

    Args:
        sexy: integer label used as the column index (0 for female, 1 for male).

    Returns:
        An int64 NumPy array of shape (1, 2) with a 1 in column ``sexy``.
    """
    # np.int64 instead of np.long: the np.long alias is not available on every NumPy
    # release, which made this function fail with AttributeError.
    a = np.zeros(shape=(1, 2), dtype=np.int64)
    a[0][sexy] = 1
    return a
def load_data():
    """Build the labelled data set from the NLTK ``names`` corpus.

    ``names.fileids()`` is unpacked into two file ids. Names from the first file get
    label 0 and names from the second get label 1; names are lower-cased but not encoded.

    Returns:
        A list of ``(name, label)`` tuples, first-file names first.
    """
    female_file, male_file = names.fileids()
    labelled = []
    for label, file_id in enumerate((female_file, male_file)):
        labelled.extend((word.lower(), label) for word in names.words(file_id))
    return labelled
[('yoshiko', 0), ('timothea', 0), ('giorgi', 1), ('thedrick', 1), ('tessie', 0), ('keith', 1), ('carena', 0), ('anthea', 0), ('cathyleen', 0), ('almeta', 0)]
class CharRNN(nn.Module):
    """Gender-conditioned character-level GRU that scores the next character.

    Args:
        vocab_size: width of each one-hot character vector.
        hidden_size: number of hidden units in the GRU.
        output_size: number of characters scored by the linear layer.
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        super(CharRNN, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # The input is widened by 2 for the one-hot gender vector that is prepended
        # to every character.
        self.rnn = nn.GRU(vocab_size + 2, hidden_size, batch_first=True)
        # Attribute name kept as-is so parameter names in saved checkpoints stay valid.
        self.liner = nn.Linear(hidden_size, output_size)

    def forward(self, sexy, name, hidden=None):
        """Score the next character for every input character.

        Args:
            sexy: one-hot gender tensor of shape [batch, 1, 2] (or [batch, seq_len, 2]).
            name: one-hot character tensor of shape [batch, seq_len, vocab_size].
            hidden: previous GRU state of shape [1, batch, hidden_size]; zeros when None.

        Returns:
            A pair ``(scores, hidden)``. ``scores`` has shape
            [batch * seq_len, output_size] and holds unnormalised logits; ``hidden`` is
            the updated GRU state.
        """
        if hidden is None:
            # Built from `name` so it matches the batch size, device and dtype in use.
            hidden = torch.zeros(
                1, name.size(0), self.hidden_size, device=name.device, dtype=name.dtype
            )
        # Repeat the gender vector along the sequence so it sits in front of every character.
        sexy = sexy.expand(-1, name.size(1), -1)
        features = torch.cat([sexy, name], dim=2)
        output, hidden = self.rnn(features, hidden)
        output = self.liner(output)
        # Dropout follows the module's train/eval mode; the functional form defaults to
        # training=True and would otherwise keep perturbing scores after .eval().
        output = F.dropout(output, 0.3, training=self.training)
        # Logits are returned without softmax: nn.CrossEntropyLoss applies log-softmax
        # itself, and topk/argmax pick the same index either way.
        return output.reshape(-1, self.output_size), hidden
class Model:
    """Training and sampling wrapper around CharRNN for name generation.

    Args:
        epoches: number of epochs run by train().
    """

    def __init__(self, epoches):
        # Inputs are created on `device`, so the network has to live there as well.
        self.model = CharRNN(len(chars), hidden_dim, output_dim).to(device)
        self.epoches = epoches

    def _step_inputs(self, sexy, char):
        """Return (sexy_tensor, name_tensor) for one character, shaped [1, 1, dim]."""
        sexy_tensor = torch.tensor(
            sexy2input(sexy), dtype=torch.float, device=device
        ).unsqueeze(0)
        name_tensor = torch.tensor(
            name2input(char), dtype=torch.float, device=device
        ).unsqueeze(0)
        return sexy_tensor, name_tensor

    def train(self, train_set, steps_per_epoch=1000):
        """Train on randomly drawn names and print the mean per-character loss per epoch.

        Args:
            train_set: sequence of (name, label) pairs.
            steps_per_epoch: number of randomly drawn names per epoch.
        """
        loss_func = nn.CrossEntropyLoss()
        optimizer = torch.optim.RMSprop(self.model.parameters(), lr=0.001)
        self.model.train()
        for epoch in range(self.epoches):
            total_loss = 0.0
            for _ in range(steps_per_epoch):
                name, sexy = random.choice(train_set)
                if not name:
                    # Nothing to predict for an empty name.
                    continue
                hidden = torch.zeros(1, 1, hidden_dim, device=device)
                optimizer.zero_grad()
                loss = 0
                # For the name "kate" the inputs are k, a, t, e and the targets are
                # a, t, e, '!'; characters are fed one at a time.
                for current, following in zip(name, name[1:] + '!'):
                    sexy_tensor, name_tensor = self._step_inputs(sexy, current)
                    target_tensor = torch.tensor(
                        name2target(following), dtype=torch.long, device=device
                    )
                    pred, hidden = self.model(sexy_tensor, name_tensor, hidden)
                    loss = loss + loss_func(pred, target_tensor)
                # Without backward() and step() the weights would never change.
                loss.backward()
                optimizer.step()
                # One prediction is made per character, so len(name) is the step count
                # (len(name) - 1 is zero for one-letter names).
                total_loss += loss.item() / len(name)
            print("Training: in epoch {} loss {}".format(epoch, total_loss/steps_per_epoch))

    def sample(self, sexy, start, max_len=8):
        """Greedily extend the prefix ``start`` into a full name.

        Args:
            sexy: gender label passed to sexy2input.
            start: non-empty prefix made of characters from ``chars``.
            max_len: maximum number of characters appended to the prefix.

        Returns:
            ``start`` followed by the generated characters, without the '!' end marker.

        Raises:
            ValueError: if ``start`` is empty.
        """
        if not start:
            raise ValueError("start must contain at least one character")
        self.model.eval()
        result = []
        with torch.no_grad():
            hidden = None
            # Warm up the hidden state on everything but the last prefix character;
            # the last one is fed by the generation loop, so nothing is fed twice.
            for c in start[:-1]:
                sexy_tensor, name_tensor = self._step_inputs(sexy, c)
                _, hidden = self.model(sexy_tensor, name_tensor, hidden)
            c = start[-1]
            # Stop at the end marker, or at max_len in case '!' is never predicted.
            while c != '!' and len(result) < max_len:
                sexy_tensor, name_tensor = self._step_inputs(sexy, c)
                pred, hidden = self.model(sexy_tensor, name_tensor, hidden)
                c = chars[torch.argmax(pred).item()]
                if c != '!':
                    result.append(c)
        return start + "".join(result)
if __name__ == "__main__":
    model = Model(10)
    data_set = load_data()
    # Fit the model first; sampling from freshly initialised weights gives noise.
    model.train(data_set)
    print(model.sample(0, "ka"))
    # Keep generating one male (1) and one female (0) name per prefix until 'q' is entered.
    c = input('please input name prefix: ')
    while c != 'q':
        print(model.sample(1, c))
        print(model.sample(0, c))
        c = input('please input name prefix: ')
亿速云「云服务器」,即开即用、新一代英特尔至强铂金CPU、三副本存储NVMe SSD云盘,价格低至29元/月。点击查看>>