
TensorFlow: Basic Text Classification Workflow

source link: https://ylhao.github.io/2018/09/06/96/


Created: 2018-09-06 12:24
#encoding: utf-8
import tensorflow as tf
from tensorflow import keras
import numpy as np
print(tf.__version__)


"""
Download the dataset
"""
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)  # keep only the 10,000 most frequent words


"""
Inspect the dataset
"""
print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))
print(train_data[0])
print(len(train_data[0]), len(train_data[1]))


"""
Convert the integer indices back to words
"""
word_index = imdb.get_word_index()
print(type(word_index))
word_index = {k: (v + 3) for k, v in word_index.items()}  # shift every index by 3 to make room for the special tokens below
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])  # invert the dictionary: index -> word

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(train_data[0]))
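
# Optional sketch (not from the original article): the reverse direction, encoding a raw
# review string into indices, using the same word_index and the same special-token offsets
# that decode_review relies on above. encode_review is a hypothetical helper for illustration.
def encode_review(text):
    unk = word_index["<UNK>"]
    tokens = [word_index["<START>"]]
    for w in text.lower().split():
        i = word_index.get(w, unk)
        tokens.append(i if i < 10000 else unk)  # keep indices inside the 10,000-word vocabulary
    return tokens

print(encode_review("this movie was great"))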


"""
padding
"""
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)

test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)

print(len(train_data[0]), len(train_data[1]))
print(train_data[0])
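
# A minimal illustration (toy sequences, assumed purely for demonstration) of what
# pad_sequences does: shorter sequences are padded with `value` at the end
# (padding='post'), longer ones are truncated to maxlen (from the front by default).
toy = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]
print(keras.preprocessing.sequence.pad_sequences(toy, value=0, padding='post', maxlen=4))
# Expected output, roughly:
# [[ 1  2  3  0]
#  [ 4  5  0  0]
#  [ 7  8  9 10]]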


"""
Build the model
"""
vocab_size = 10000
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))           # map each word index to a 16-dimensional embedding
model.add(keras.layers.GlobalAveragePooling1D())            # average the embeddings over the sequence dimension
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))  # probability that the review is positive
model.summary()
# tf.train.AdamOptimizer is the TF 1.x API; on TF 2.x pass 'adam' or keras.optimizers.Adam() instead
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss='binary_crossentropy',
              metrics=['accuracy'])
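
# Rough sanity check (my own arithmetic, not output copied from the article) of the
# parameter counts that model.summary() reports above:
#   Embedding: 10000 words * 16 dims            = 160,000
#   Dense(16): 16 inputs * 16 units + 16 biases =     272
#   Dense(1):  16 inputs * 1 unit   + 1 bias    =      17
print(10000 * 16 + (16 * 16 + 16) + (16 * 1 + 1))  # 160289 trainable parameters in total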


"""
Split the training data into a validation set and a training set
"""
x_val = train_data[:10000]
partial_x_train = train_data[10000:]
y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]


"""
Train
"""
# model.fit() returns a History object that contains a dictionary with everything that happened during training
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=60,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)  # verbose: 0 = silent, 1 = progress bar, 2 = one line per epoch


"""
Evaluate the model
"""
results = model.evaluate(test_data, test_labels)
print(results)
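
# Optional check (not in the original article): the sigmoid output is the predicted
# probability that a review is positive, so a few test examples can be inspected directly.
probs = model.predict(test_data[:3])
for p, label in zip(probs, test_labels[:3]):
    print("predicted positive probability: {:.3f}, true label: {}".format(float(p[0]), label))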

history_dict = history.history
print(history_dict.keys())


"""
Plot the training curves
"""
import matplotlib.pyplot as plt

# under TF 1.x Keras the History keys are 'acc' / 'val_acc'; newer versions use 'accuracy' / 'val_accuracy'
acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)

# 'bo': blue dots
plt.plot(epochs, loss, 'bo', label='Training loss')
# 'b': solid blue line
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()  # add the legend
plt.savefig("text-classification_loss.png")

plt.clf()   # clear figure
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig("text-classification_acc.png")

Results and network structure

(The original post shows the model summary and the saved loss/accuracy plots here.)


Please credit the source when reposting. You are welcome to check the references cited in this article and to point out anything wrong or unclear, either in the comment section below or by email to [email protected]
