# 动手实验 - TensorFlow和TensorBoard自然语言分析

+关注继续查看

import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from collections import namedtuple

train = pd.read_csv("labeledTrainData.tsv", delimiter="\t")
test = pd.read_csv("testData.tsv", delimiter="\t")

# Here's the first review as an example
With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci's character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ's music.<br /><br />Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.<br /><br />Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ's bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i've gave this subject....hmmm well i don't know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter.

def clean_text(text, remove_stopwords=True):
'''Clean the text, with the option to remove stopwords'''

# Convert words to lower case and split them
text = text.lower().split()

# Optionally, remove stop words
if remove_stopwords:
stops = set(stopwords.words("english"))
text = [w for w in text if not w in stops]

text = " ".join(text)

# Clean the text
text = re.sub(r"<br />", " ", text)
text = re.sub(r"[^a-z]", " ", text)
text = re.sub(r"   ", " ", text) # Remove any extra spaces
text = re.sub(r"  ", " ", text)

# Return a list of words
return(text)

# stop words
if remove_stopwords:
stops = set(stopwords.words("english"))
text = [w for w in text if not w in stops]

# re
text = re.sub(r"<br />", " ", text)
text = re.sub(r"[^a-z]", " ", text)
text = re.sub(r"   ", " ", text) # Remove any extra spaces
text = re.sub(r"  ", " ", text)

re就是我们的正则式。是字符串的一种简化形式。

# Tokenize the reviews
all_reviews = train_clean + test_clean
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_reviews)
print("Fitting is complete.")

train_seq = tokenizer.texts_to_sequences(train_clean)
print("train_seq is complete.")

test_seq = tokenizer.texts_to_sequences(test_clean)
print("test_seq is complete")

word_index = tokenizer.word_index

[445, 86, 489, 10939, 8, 61, 583, 2603, 120, 68, 957, 560, 53, 212, 24485, 212, 17247, 219, 193, 97, 20, 695, 2565, 124, 109, 15, 520, 3954, 193, 27, 246, 654, 2352, 1261, 17247, 90, 4782, 90, 712, 3, 305, 86, 16, 358, 1846, 542, 1219, 3592, 10939, 1, 485, 871, 3538, 23, 526, 673, 1414, 19, 63, 5305, 2089, 1118, 185, 413, 1523, 817, 2583, 7, 10939, 477, 86, 665, 85, 272, 114, 578, 10939, 34480, 29662, 148, 2, 10939, 381, 13, 59, 26, 381, 210, 15, 252, 178, 10, 751, 712, 3, 142, 341, 464, 145, 16427, 4121, 1718, 635, 876, 10547, 1018, 12089, 890, 1067, 1652, 416, 10939, 265, 19, 596, 141, 10939, 18336, 2302, 15821, 876, 10547, 1, 34, 38190, 388, 21, 49, 17539, 1414, 434, 9821, 193, 4238, 10939, 1, 120, 669, 520, 96, 7, 10939, 1555, 444, 2271, 138, 2137, 2383, 635, 23, 72, 117, 4750, 5364, 307, 1326, 31136, 19, 635, 556, 888, 665, 697, 6, 452, 195, 547, 138, 689, 3386, 1234, 790, 56, 1239, 268, 2, 21, 7, 10939, 6, 580, 78, 476, 32, 21, 245, 706, 158, 276, 113, 7674, 673, 3526, 10939, 1, 37925, 1690, 2, 159, 413, 1523, 294, 6, 956, 21, 51, 1500, 1226, 2352, 17, 612, 8, 61, 442, 724, 7184, 17, 25, 4, 49, 21, 199, 443, 3912, 3484, 49, 110, 270, 495, 252, 289, 124, 6, 19622, 19910, 363, 1502]

max_review_length = 200



np.percentile(lengths.counts, 80)

x_train, x_valid, y_train, y_valid = train_test_split(train_pad, train.sentiment, test_size = 0.15, random_state = 2)

def get_batches(x, y, batch_size):
'''Create the batches for the training and validation data'''
n_batches = len(x)//batch_size
x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
for ii in range(0, len(x), batch_size):
yield x[ii:ii+batch_size], y[ii:ii+batch_size]
def get_test_batches(x, batch_size):
'''Create the batches for the testing data'''
n_batches = len(x)//batch_size
x = x[:n_batches*batch_size]
for ii in range(0, len(x), batch_size):
yield x[ii:ii+batch_size]


def build_rnn(n_words, embed_size, batch_size, lstm_size, num_layers, dropout, learning_rate, multiple_fc, fc_units):
'''Build the Recurrent Neural Network'''

tf.reset_default_graph()

# Declare placeholders we'll feed into the graph
with tf.name_scope('inputs'):
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')

with tf.name_scope('labels'):
labels = tf.placeholder(tf.int32, [None, None], name='labels')

keep_prob = tf.placeholder(tf.float32, name='keep_prob')

# Create the embeddings
with tf.name_scope("embeddings"):
embedding = tf.Variable(tf.random_uniform((n_words,
embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embedding, inputs)

# Build the RNN layers
with tf.name_scope("RNN_layers"):
lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
drop = tf.contrib.rnn.DropoutWrapper(lstm,
output_keep_prob=keep_prob)
cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)

# Set the initial state
with tf.name_scope("RNN_init_state"):
initial_state = cell.zero_state(batch_size, tf.float32)

# Run the data through the RNN layers
with tf.name_scope("RNN_forward"):
outputs, final_state = tf.nn.dynamic_rnn(
cell,
embed,
initial_state=initial_state)

# Create the fully connected layers
with tf.name_scope("fully_connected"):

# Initialize the weights and biases
weights = tf.truncated_normal_initializer(stddev=0.1)
biases = tf.zeros_initializer()

dense = tf.contrib.layers.fully_connected(outputs[:, -1],
num_outputs = fc_units,
activation_fn = tf.sigmoid,
weights_initializer = weights,
biases_initializer = biases)

dense = tf.contrib.layers.dropout(dense, keep_prob)

# Depending on the iteration, use a second fully connected
layer
if multiple_fc == True:
dense = tf.contrib.layers.fully_connected(dense,
num_outputs = fc_units,
activation_fn = tf.sigmoid,
weights_initializer = weights,
biases_initializer = biases)

dense = tf.contrib.layers.dropout(dense, keep_prob)

# Make the predictions
with tf.name_scope('predictions'):
predictions = tf.contrib.layers.fully_connected(dense,
num_outputs = 1,
activation_fn=tf.sigmoid,
weights_initializer = weights,
biases_initializer = biases)

tf.summary.histogram('predictions', predictions)

# Calculate the cost
with tf.name_scope('cost'):
cost = tf.losses.mean_squared_error(labels, predictions)
tf.summary.scalar('cost', cost)

# Train the model
with tf.name_scope('train'):
optimizer =

# Determine the accuracy
with tf.name_scope("accuracy"):
correct_pred = tf.equal(tf.cast(tf.round(predictions),
tf.int32),
labels)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
tf.summary.scalar('accuracy', accuracy)

# Merge all of the summaries
merged = tf.summary.merge_all()

# Export the nodes
export_nodes = ['inputs', 'labels', 'keep_prob','initial_state',
'final_state','accuracy', 'predictions', 'cost',
'optimizer', 'merged']
Graph = namedtuple('Graph', export_nodes)
local_dict = locals()
graph = Graph(*[local_dict[each] for each in export_nodes])

return graph


tf.reset_default_graph()

# Declare placeholders we'll feed into the graph
with tf.name_scope('inputs'):
inputs = tf.placeholder(tf.int32, [None, None],
name='inputs')

with tf.name_scope('labels'):
labels = tf.placeholder(tf.int32, [None, None],
name='labels')

keep_prob = tf.placeholder(tf.float32, name='keep_prob')


# Create the embeddings
with tf.name_scope("embeddings"):
embedding = tf.Variable(tf.random_uniform((n_words,
embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embedding, inputs)


embedding = tf.Variable(tf.truncated_normal((n_words, embed_size), -0.1, 0.1))

# Build the RNN layers
with tf.name_scope("RNN_layers"):
lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
drop = tf.contrib.rnn.DropoutWrapper(lstm,
output_keep_prob=keep_prob)
cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)


# Set the initial state
with tf.name_scope("RNN_init_state"):
initial_state = cell.zero_state(batch_size, tf.float32)


# Run the data through the RNN layers
with tf.name_scope("RNN_forward"):
outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
initial_state=initial_state)


# Create the fully connected layers
with tf.name_scope("fully_connected"):

# Initialize the weights and biases
weights = tf.truncated_normal_initializer(stddev=0.1)
biases = tf.zeros_initializer()

dense = tf.contrib.layers.fully_connected(outputs[:, -1],
num_outputs = fc_units,
activation_fn = tf.sigmoid,
weights_initializer = weights,
biases_initializer = biases)

dense = tf.contrib.layers.dropout(dense, keep_prob)

# Depending on the iteration, use a second fully connected layer
if multiple_fc == True:
dense = tf.contrib.layers.fully_connected(dense,
num_outputs = fc_units,
activation_fn = tf.sigmoid,
weights_initializer = weights,
biases_initializer = biases)

dense = tf.contrib.layers.dropout(dense, keep_prob)


# Make the predictions
with tf.name_scope('predictions'):
predictions = tf.contrib.layers.fully_connected(dense,
num_outputs = 1,
activation_fn = tf.sigmoid,
weights_initializer = weights,
biases_initializer = biases)

tf.summary.histogram('predictions', predictions)


# Calculate the cost
with tf.name_scope('cost'):
cost = tf.losses.mean_squared_error(labels, predictions)
tf.summary.scalar('cost', cost)


# Train the model
with tf.name_scope('train'):


# Determine the accuracy
with tf.name_scope("accuracy"):
correct_pred = tf.equal(tf.cast(tf.round(predictions),
tf.int32),
labels)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
tf.summary.scalar('accuracy', accuracy)


# Merge all of the summaries
merged = tf.summary.merge_all()


# Export the nodes
export_nodes = ['inputs', 'labels', 'keep_prob','initial_state',
'final_state','accuracy', 'predictions', 'cost',
'optimizer', 'merged']
Graph = namedtuple('Graph', export_nodes)
local_dict = locals()
graph = Graph(*[local_dict[each] for each in export_nodes])


def train(model, epochs, log_string):
'''Train the RNN'''

saver = tf.train.Saver()

with tf.Session() as sess:
sess.run(tf.global_variables_initializer())

# Used to determine when to stop the training early
valid_loss_summary = []

# Keep track of which batch iteration is being trained
iteration = 0

print()
print("Training Model: {}".format(log_string))

train_writer = tf.summary.FileWriter('./logs/3/train/{}'.format(log_string), sess.graph)
valid_writer = tf.summary.FileWriter('./logs/3/valid/{}'.format(log_string))

for e in range(epochs):
state = sess.run(model.initial_state)

# Record progress with each epoch
train_loss = []
train_acc = []
val_acc = []
val_loss = []

with tqdm(total=len(x_train)) as pbar:
for _, (x, y) in enumerate(get_batches(x_train,
y_train,
batch_size), 1):
feed = {model.inputs: x,
model.labels: y[:, None],
model.keep_prob: dropout,
model.initial_state: state}
summary, loss, acc, state, _ =
sess.run([model.merged,
model.cost,
model.accuracy,
model.final_state,
model.optimizer],
feed_dict=feed)

# Record the loss and accuracy of each training
batch

train_loss.append(loss)
train_acc.append(acc)

# Record the progress of training

iteration += 1
pbar.update(batch_size)

# Average the training loss and accuracy of each epoch
avg_train_loss = np.mean(train_loss)
avg_train_acc = np.mean(train_acc)

val_state = sess.run(model.initial_state)
with tqdm(total=len(x_valid)) as pbar:
for x, y in get_batches(x_valid,y_valid,batch_size):
feed = {model.inputs: x,
model.labels: y[:, None],
model.keep_prob: 1,
model.initial_state: val_state}
summary, batch_loss, batch_acc, val_state =
sess.run([model.merged,
model.cost,
model.accuracy,
model.final_state],
feed_dict=feed)

# Record the validation loss and accuracy of
each epoch

val_loss.append(batch_loss)
val_acc.append(batch_acc)
pbar.update(batch_size)

# Average the validation loss and accuracy of each epoch
avg_valid_loss = np.mean(val_loss)
avg_valid_acc = np.mean(val_acc)
valid_loss_summary.append(avg_valid_loss)

# Record the validation data's progress

# Print the progress of each epoch
print("Epoch: {}/{}".format(e, epochs),
"Train Loss: {:.3f}".format(avg_train_loss),
"Train Acc: {:.3f}".format(avg_train_acc),
"Valid Loss: {:.3f}".format(avg_valid_loss),
"Valid Acc: {:.3f}".format(avg_valid_acc))

# Stop training if the validation loss does not decrease
after 3 epochs

if avg_valid_loss > min(valid_loss_summary):
print("No Improvement.")
stop_early += 1
if stop_early == 3:
break

# Reset stop_early if the validation loss finds a new low
# Save a checkpoint of the model
else:
print("New Record!")
stop_early = 0
checkpoint ="./sentiment_{}.ckpt".format(log_string)
saver.save(sess, checkpoint)


saver = tf.train.Saver()

train_writer = tf.summary.FileWriter('./logs/3/train/{}'.format(log_string), sess.graph)
valid_writer = tf.summary.FileWriter('./logs/3/valid/{}'.format(log_string))


tqdm() （https://pypi.python.org/pypi/tqdm）则会记录每个过程的时间，这样我就能了解每次过程中还有大致多少剩余时间。

# Record the validation data's progress


# Reset stop_early if the validation loss finds a new low
# Save a checkpoint of the model
else:
print("New Record!")
stop_early = 0
checkpoint = "./sentiment_{}.ckpt".format(log_string)
saver.save(sess, checkpoint)


n_words = len(word_index)
embed_size = 300
batch_size = 250
lstm_size = 128
num_layers = 2
dropout = 0.5
learning_rate = 0.001
epochs = 100
multiple_fc = False
fc_units = 256


·         batch_size: 250是每份数据的量。如果我用了256，那么最终在上传预测结果时就会碰到麻烦。因为最后一个批处理会因为长度不够而不进行预测。

·         epochs: 我常用13来设置这个值。因为我希望能更早的停止迭代而不是已经运行了多次次迭代以后。如果使用一个比较大的值（例如，100），那我们能保证模型已经被完全训练过了。

# Train the model with the desired tuning parameters
for lstm_size in [64,128]:
for multiple_fc in [True, False]:
for fc_units in [128, 256]:
log_string = 'ru={},fcl={},fcu={}'.format(lstm_size,
multiple_fc,
fc_units)
model = build_rnn(n_words = n_words,
embed_size = embed_size,
batch_size = batch_size,
lstm_size = lstm_size,
num_layers = num_layers,
dropout = dropout,
learning_rate = learning_rate,
multiple_fc = multiple_fc,
fc_units = fc_units)
train(model, epochs, log_string)


def make_predictions(lstm_size, multiple_fc, fc_units, checkpoint):
'''Predict the sentiment of the testing data'''

# Record all of the predictions
all_preds = []

model = build_rnn(n_words = n_words,
embed_size = embed_size,
batch_size = batch_size,
lstm_size = lstm_size,
num_layers = num_layers,
dropout = dropout,
learning_rate = learning_rate,
multiple_fc = multiple_fc,
fc_units = fc_units)

with tf.Session() as sess:
saver = tf.train.Saver()
saver.restore(sess, checkpoint)
test_state = sess.run(model.initial_state)
for _, x in enumerate(get_test_batches(x_test,
batch_size), 1):
feed = {model.inputs: x,
model.keep_prob: 1,
model.initial_state: test_state}
predictions = sess.run(model.predictions,feed_dict=feed)
for pred in predictions:
all_preds.append(float(pred))

return all_preds


【翻译】Sklearn与TensorFlow机器学习实用指南 ——第12章 设备和服务器上的分布式TensorFlow（上）

997 0
Tensorflow快餐教程(9) - 卷积

2632 0
Tensorflow MobileNet移动端迁移学习指南
Tensorflow MobileNet移动端迁移学习指南 1。介绍 TensorFlow是用于数值计算的开源库，专门用于机器学习应用程序。笔者： csdn -固本培元  交流邮箱： leo_luopy@139.com  微信：leoluopy 。
2138 0
Tensorflow框架（转载）

700 0
TensorFlow构建循环神经网络

1273 0
TensorFlow中文社区---下载与安装

1813 0
+关注
【方向】

706

5