Structured-data news writing
Trending-news writing
Investment-research report writing
Treat each character in a poem as Y, and the three characters immediately preceding it as X.
Build a logistic-regression-style model that predicts the probability of Y from X.
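For intuition, here is a minimal sketch of this windowing on a single made-up line (the line and variable names are illustrative only, not part of the pipeline below):

line = list('床前明月光,疑')
for i in range(len(line) - 3):
    print(line[i:i+3], '->', line[i+3])   # X = three preceding characters, Y = the next one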
# Read the data with pandas
import pandas as pd
poems_text = pd.read_table('data/poems_clean.txt', header=None)
poems_text.columns = ["text"]
# Inspect the text
print(poems_text.head())
print(poems_text.shape)
# Strip titles and spaces
import string
import numpy as np
poems_new = []
for line in poems_text['text']:          # iterate over the "text" column
    title, poem = line.split(':')        # drop the title before the colon
    poem = poem.replace(' ', '')         # remove spaces
    poem = 'bbb' + poem                  # prepend three placeholder start characters
    poems_new.append(list(poem))
print(len(poems_new))
# Build the X and Y matrix
XY = []
for poem in poems_new:
    for i in range(len(poem) - 3):
        x1 = poem[i]
        x2 = poem[i + 1]
        x3 = poem[i + 2]
        y = poem[i + 3]
        XY.append([x1, x2, x3, y])
# Show the rearranged X and Y
print("Original line:")
print(poems_text['text'][3864])
print("\n")
print("Training data:")
print(["X1", "X2", "X3", "Y"])
for i in range(132763, 132773):
    print(XY[i])
# Encode the characters as integers
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poems_new)
print(tokenizer.word_index)
# Tokenizer reserves index 0 (it is never assigned to a character) and indexes characters
# from 1, while the classification loss counts classes from 0. So with three characters
# a, b, c indexed 1, 2, 3 and no 0, Keras still needs a 4-class output layer:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
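# A tiny demo of this indexing behavior on a toy corpus (illustrative only, not part of the pipeline):
toy_tokenizer = Tokenizer()
toy_tokenizer.fit_on_texts([['a', 'b', 'c'], ['a', 'b']])
print(toy_tokenizer.word_index)  # {'a': 1, 'b': 2, 'c': 3}: indices start at 1; 0 is never assigned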
XY_digit = np.array(tokenizer.texts_to_sequences(XY))
X_digit = XY_digit[:, :3]
Y_digit = XY_digit[:, 3]
for i in range(132763, 132773):
    print("{:<35}".format(str(XY[i])), "\t", "{:<30}".format(str(list(X_digit[i]))), "\t", Y_digit[i])
# Embedding + linear model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Activation, Embedding, Flatten
hidden_size = 300
inp = Input(shape=(3,))
x = Embedding(vocab_size, hidden_size)(inp)   # map each character index to a 300-d vector
x = Flatten()(x)                              # concatenate the three embeddings
x = Dense(vocab_size)(x)                      # linear layer over the vocabulary
pred = Activation('softmax')(x)
linear_model = Model(inp, pred)               # this is a linear model, not an LSTM, so name it accordingly
linear_model.summary()
# Train the model
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_digit, Y_digit, test_size=0.2, random_state=0)
from tensorflow.keras.optimizers import Adam
linear_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
linear_model.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=1000, epochs=1)
linear_model.save('LinearPoemModel')
# linear_model.save_weights('Linear.h5')  # HDF5 file format
# Check the model
from tensorflow.keras.models import load_model
model = load_model('LinearPoemModel')
# model.load_weights('Linear.h5')
sample_text = ['白', '日', '依']
print(sample_text)
sample_index = tokenizer.texts_to_sequences(sample_text)
print(sample_index)
word_prob = model.predict(np.array(sample_index).reshape(1, 3))[0]
print(tokenizer.index_word[word_prob.argmax()], word_prob.max())
# Apply the model
poem_incomplete = 'bbb风****花****雪****月****'
poem_index = []
poem_text = ''
for i in range(len(poem_incomplete)):
    current_word = poem_incomplete[i]
    if current_word != '*':
        # a given character
        index = tokenizer.word_index[current_word]
    else:
        # predict * from the three preceding characters
        x = poem_index[-3:]
        y = model.predict(np.expand_dims(x, axis=0))[0]
        index = y.argmax()
        current_word = tokenizer.index_word[index]
    poem_index.append(index)
    poem_text = poem_text + current_word
poem_text = poem_text[3:]   # drop the 'bbb' prefix
print(poem_text[0:5])
print(poem_text[5:10])
print(poem_text[10:15])
print(poem_text[15:20])
Example poem in the raw data format (title:text), 《静夜思》 (Quiet Night Thoughts): 静夜思:床前明月光,疑似地上霜,举头望明月,低头思故乡
The input length must be fixed (e.g., X = 明, 月, 光). Question: can the length be arbitrary?
The input has no memory (e.g., X = 前, 明, 月: where did 床 go?). Question: can we remember the history?
An RNN (Recurrent Neural Network) is a class of models designed specifically for sequential text data.
Core idea: historical information is continuously retained and passed forward, and the carrier of this retention and transfer is the state $Z_t$.
RNNs were first proposed and applied by researchers in cognitive science and computational neuroscience, and were later widely adopted for modeling sequence data.
Current input + previous state = current state, i.e. $Z_t = f(X_t, Z_{t-1})$.
After each step, the network passes its state on to the next step, much as our own train of thought carries context forward.
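This recurrence is easy to write down directly. Below is a minimal NumPy sketch of the update $Z_t = \tanh(W_x X_t + W_z Z_{t-1} + b)$; the dimensions and random weights are illustrative only, not taken from the model that follows:

import numpy as np
input_dim, state_dim = 4, 3
rng = np.random.default_rng(0)
W_x = rng.normal(size=(state_dim, input_dim))   # input-to-state weights
W_z = rng.normal(size=(state_dim, state_dim))   # state-to-state weights
b = np.zeros(state_dim)
Z = np.zeros(state_dim)                          # initial state
for X_t in rng.normal(size=(5, input_dim)):      # a toy sequence of 5 inputs
    Z = np.tanh(W_x @ X_t + W_z @ Z + b)         # current input + previous state -> current state
print(Z)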
# Read and inspect the data
import string
import numpy as np
f = open('data/poems_clean.txt', "r", encoding='utf-8')
poems = []
for line in f.readlines():
    title, poem = line.split(':')      # drop the title
    poem = poem.replace(' ', '')       # remove spaces
    poem = poem.replace('\n', '')      # remove the newline
    poems.append(list(poem))
print(poems[0][:])
# Data preparation: character encoding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poems)
poems_digit = tokenizer.texts_to_sequences(poems)
vocab_size = len(tokenizer.word_index) + 1   # +1 for the reserved index 0
print(vocab_size)                            # number of distinct characters
# Pad every poem with 0 to the same length so that all poems fit in one M*N np.array
poems_digit = pad_sequences(poems_digit, maxlen=50, padding='post')
print("Original poem")
print(poems[3864])
print("\n")
print("Encoded and padded result")
print(poems_digit[3864])
# Align X and Y
X = poems_digit[:, :-1]   # characters 1..49: the inputs
Y = poems_digit[:, 1:]    # characters 2..50: the next-character targets
print(poems_digit.shape)
print(X.shape)
print(Y.shape)
print("X sample", "\t", "Y sample")
for i in range(10):
    print(X[0][i], "\t", Y[0][i])
print("...", "\t", "...")
# Turn Y into one-hot vectors
print(vocab_size)
from tensorflow.keras.utils import to_categorical
Y = to_categorical(Y, num_classes=vocab_size)
print(Y.shape)
# Choose the dimensions
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, SimpleRNN, Dense, Embedding, Activation
embedding_size = 100
hidden_size = 200
# Build the RNN model
inp = Input(shape=(49,))
# Encoder
x = Embedding(vocab_size, embedding_size, mask_zero=True)(inp)   # mask the padding index 0
x = SimpleRNN(hidden_size, return_sequences=True)(x)             # output the state at every step
# Prediction
x = Dense(vocab_size)(x)
pred = Activation('softmax')(x)
model = Model(inp, pred)
model.summary()
# Train the model
from tensorflow.keras.optimizers import Adam
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model.fit(X, Y, epochs=1, batch_size=200, validation_split=0.2)
model.save('RNNPoemModel')
# 模型应用
from tensorflow.keras.models import load_model
model = load_model('RNNPoem' + 'model')
poem_incomplete = '床****疑****举****低****'
poem_index = []
poem_text = ''
for i in range(len(poem_incomplete)):
current_word = poem_incomplete[i]
if current_word != '*':
index = tokenizer.word_index[current_word]
else:
x = np.expand_dims(poem_index, axis=0)
x = pad_sequences(x, maxlen=49, padding='post')
y = model.predict(x)[0, i]
y[0] = 0 #去掉停止词
index = y.argmax()
current_word = tokenizer.index_word[index]
poem_index.append(index)
poem_text = poem_text + current_word
poem_text = poem_text[0:]
print(poem_text[0:5])
print(poem_text[5:10])
print(poem_text[10:15])
print(poem_text[15:20])
It consists of two parts: a sigmoid layer decides which entries of the state get updated, while a tanh layer outputs the vector of candidate update values $\tilde{C}_t$.
After these two steps, the long-term state at the current time can be computed as $C_t = f_t \odot C_{t-1} + i_t \odot \tilde{C}_t$.
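To make the gate mechanics concrete, here is a minimal NumPy sketch of a single LSTM step under the usual formulation ($f_t$, $i_t$, $o_t$ are sigmoid gates and $\tilde{C}_t$ is the tanh candidate). Stacking all four gates into one weight matrix W is a compactness assumption of this sketch, not the exact Keras internals:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, C_prev, W, b):
    # W maps the concatenated (h_{t-1}, x_t) to the four gate pre-activations
    z = W @ np.concatenate([h_prev, x_t]) + b
    f_t, i_t, o_t, c_hat = np.split(z, 4)
    f_t, i_t, o_t = sigmoid(f_t), sigmoid(i_t), sigmoid(o_t)
    C_t = f_t * C_prev + i_t * np.tanh(c_hat)    # update the long-term state
    h_t = o_t * np.tanh(C_t)                     # short-term state / output
    return h_t, C_t

hidden, inp_dim = 3, 4                           # toy dimensions
rng = np.random.default_rng(0)
W = rng.normal(size=(4 * hidden, hidden + inp_dim))
b = np.zeros(4 * hidden)
h, C = np.zeros(hidden), np.zeros(hidden)        # initial states
h, C = lstm_step(rng.normal(size=inp_dim), h, C, W, b)
print(h, C)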
# Read and inspect the data
import string
import numpy as np
f = open('data/poems_clean.txt', "r", encoding='utf-8')
poems = []
for line in f.readlines():
    title, poem = line.split(':')      # drop the title
    poem = poem.replace(' ', '')       # remove spaces
    poem = poem.replace('\n', '')      # remove the newline
    poems.append(list(poem))
print(poems[0][:])
# Data preparation: character encoding. Use `Tokenizer` from `keras` to build a dictionary
# over the corpus, assigning each character an index.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poems)
vocab_size = len(tokenizer.word_index) + 1   # +1 for the reserved index 0
poems_digit = tokenizer.texts_to_sequences(poems)
# Pad every poem with 0 to the same length so that all poems fit in one M*N np.array
poems_digit = pad_sequences(poems_digit, maxlen=50, padding='post')
# Data preparation: show the padded encoding
print("Original poem")
print(poems[3864])
print("\n")
print("Encoded and padded result")
print(poems_digit[3864])
# Generate X and Y
X = poems_digit[:, :-1]
Y = poems_digit[:, 1:]
print("X sample", "\t", "Y sample")
for i in range(10):
    print(X[0][i], "\t", Y[0][i])
print("...", "\t", "...")
# Turn Y into one-hot vectors
from tensorflow.keras.utils import to_categorical
Y = to_categorical(Y, num_classes=vocab_size)
print(Y.shape)
# Build the LSTM model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Activation
from tensorflow.keras import Model
hidden_size1 = 300   # embedding dimension
hidden_size2 = 100   # LSTM state dimension
inp = Input(shape=(49,))
# Encoder
x = Embedding(vocab_size, hidden_size1, input_length=49, mask_zero=True)(inp)
x = LSTM(hidden_size2, return_sequences=True)(x)
# Prediction
x = Dense(vocab_size)(x)
pred = Activation('softmax')(x)
model = Model(inp, pred)
model.summary()
Embedding layer
LSTM layer
First, from the earlier introduction to the LSTM, the nonlinear transforms that carry parameters are $f_t$, $i_t$, $\tilde C_t$, and $o_t$, and each consumes the same number of parameters. The main reason behind this is that TensorFlow requires $h_t$ and $c_t$ to have the same dimension (in theory they could differ).
Second, take $f_t$ as an example: it acts on $(h_{t-1}, x_t)$. With $h_{t-1}$ a 64-dimensional vector and $x_t$ a 128-dimensional one, an intercept included, each output unit needs $64 + 128 + 1 = 193$ parameters; applying these across the 64 units of the forget gate that updates the state $C_t$ costs $193 \times 64 = 12352$ parameters.
Third, since $f_t$, $i_t$, $\tilde C_t$, and $o_t$ make four nonlinear transforms, each costing 12352 parameters, the total is $12352 \times 4 = 49408$. (These worked numbers assume a 128-dimensional input and 64 hidden units; for the model above, the same formula gives $4 \times (100 + 300 + 1) \times 100 = 160400$ parameters for the LSTM layer.)
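As a sanity check, this count can be reproduced with a standalone LSTM layer of those dimensions (a minimal sketch assuming input dimension 128 and hidden size 64, as in the worked example above):

from tensorflow.keras.layers import Input, LSTM
from tensorflow.keras.models import Model

inp = Input(shape=(None, 128))       # sequences of 128-dimensional inputs
out = LSTM(64)(inp)                  # 64 hidden units
Model(inp, out).summary()            # the LSTM layer reports 4 * (64 + 128 + 1) * 64 parameters
print(4 * (64 + 128 + 1) * 64)       # 49408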
Dense layer
# Train the model
from tensorflow.keras.optimizers import Adam
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
model.fit(X, Y, epochs=1, batch_size=1000, validation_split=0.2)
model.save('LSTMPoemModel')
# Apply the model
from tensorflow.keras.models import load_model
model = load_model('LSTMPoemModel')
poem_incomplete = '风****花****雪****月****'
poem_index = []
poem_text = ''
for i in range(len(poem_incomplete)):
    current_word = poem_incomplete[i]
    if current_word != '*':
        index = tokenizer.word_index[current_word]
    else:
        x = np.expand_dims(poem_index, axis=0)
        x = pad_sequences(x, maxlen=49, padding='post')
        # with mask_zero=True, padded steps carry the last real step's output forward,
        # so the prediction at step i reflects the characters generated so far
        y = model.predict(x)[0, i]
        y[0] = 0  # rule out the padding index 0
        index = y.argmax()
        current_word = tokenizer.index_word[index]
    poem_index.append(index)
    poem_text = poem_text + current_word
print(poem_text[0:5])
print(poem_text[5:10])
print(poem_text[10:15])
print(poem_text[15:20])