Contents
- Outline
- Dataset
- Main code
I. Outline
This text classification series will run to roughly eight articles. The code can be downloaded directly from GitHub and the training data from Baidu Cloud; import the project into PyCharm and it is ready to run. The series covers text classification based on pretrained word2vec vectors, as well as classification based on the pretrained models of recent years (ELMo, BERT, etc.).
II. Dataset
The dataset is the IMDB movie review corpus. It consists of three files under /data/rawData: unlabeledTrainData.tsv, labeledTrainData.tsv, and testData.tsv. Text classification itself requires labeled data (labeledTrainData), but the unlabeled data can be used as well when training the word2vec word-vector model, since that step is unsupervised.
Training data download: https://pan.baidu.com/s/1-XEwx1ai8kkGsMagIFKX_g (extraction code: rtz8)
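As a quick sanity check, the raw files can be inspected with pandas. This is a minimal sketch assuming the standard Kaggle IMDB format (tab-separated, with labeledTrainData.tsv carrying id, sentiment, and review columns):

import pandas as pd

# quoting=3 tells pandas to ignore the double quotes that appear inside review text
labeled = pd.read_csv("../data/rawData/labeledTrainData.tsv", sep="\t", quoting=3)
print(labeled.shape)          # (25000, 3) for the standard labeled split
print(list(labeled.columns))  # ['id', 'sentiment', 'review']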
III. Main Code
3.1 Training parameter configuration: parameter_config.py
# Configuration parameters
class TrainingConfig(object):
    epoches = 10            # number of training epochs
    evaluateEvery = 100     # evaluate every 100 steps
    checkpointEvery = 100   # save a checkpoint every 100 steps
    learningRate = 0.001

class ModelConfig(object):
    embeddingSize = 200
    hiddenSizes = [256, 256]  # number of units in each Bi-LSTM layer (two stacked layers here)
    dropoutKeepProb = 0.5
    l2RegLambda = 0.0

class Config(object):
    sequenceLength = 200  # roughly the mean sequence length over the dataset
    batchSize = 128
    dataSource = "../data/preProcess/labeledTrain.csv"
    stopWordSource = "../data/english"
    numClasses = 1  # 1 for binary classification; for multi-class, set to the number of classes
    rate = 0.8  # proportion of the data used for training
    training = TrainingConfig()
    model = ModelConfig()

# Instantiate the configuration object
# config = Config()
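The other modules consume these settings simply by instantiating Config; nested parameters are reached through the training and model attributes:

import parameter_config

config = parameter_config.Config()
print(config.batchSize)              # 128
print(config.training.learningRate)  # 0.001
print(config.model.hiddenSizes)      # [256, 256]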
3.2 Building the training data: get_train_data.py
# Author: yifan
import json
from collections import Counter
import gensim
import pandas as pd
import numpy as np
import parameter_config

# 2 Data preprocessing class: builds the training and evaluation sets.
# 1) Load the data, split each sentence into words, and remove low-frequency words and stop words.
# 2) Map words to indices and build a word-to-index vocabulary, saved as JSON
#    so it can be reused at inference time. (Words missing from the pretrained
#    word2vec vocabulary are mapped to UNK.)
# 3) Read the word vectors out of the pretrained word2vec model; they are fed
#    to the model as the initial embedding values.
# 4) Split the data into a training set and an evaluation set.

class Dataset(object):
    def __init__(self, config):
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource
        self._sequenceLength = config.sequenceLength  # every input sequence is padded/truncated to this length
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate
        self._stopWordDict = {}
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []
        self.wordEmbedding = None
        self.labelList = []

    def _readData(self, filePath):
        """
        Read the dataset from a CSV file.
        """
        df = pd.read_csv(filePath)
        if self.config.numClasses == 1:
            labels = df["sentiment"].tolist()
        elif self.config.numClasses > 1:
            labels = df["rate"].tolist()
        review = df["review"].tolist()
        reviews = [line.strip().split() for line in review]
        return reviews, labels

    def _labelToIndex(self, labels, label2idx):
        """
        Convert labels to their index representation.
        """
        labelIds = [label2idx[label] for label in labels]
        return labelIds

    def _wordToIndex(self, reviews, word2idx):
        """
        Convert words to indices.
        """
        reviewIds = [[word2idx.get(item, word2idx["UNK"]) for item in review] for review in reviews]
        return reviewIds

    def _genTrainEvalData(self, x, y, word2idx, rate):
        """
        Build the training and evaluation sets.
        """
        reviews = []
        for review in x:
            if len(review) >= self._sequenceLength:
                reviews.append(review[:self._sequenceLength])
            else:
                reviews.append(review + [word2idx["PAD"]] * (self._sequenceLength - len(review)))
        trainIndex = int(len(x) * rate)
        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.array(y[:trainIndex], dtype="float32")
        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.array(y[trainIndex:], dtype="float32")
        return trainReviews, trainLabels, evalReviews, evalLabels

    def _getWordEmbedding(self, words):
        """
        Look up the pretrained word2vec vector for every word in our vocabulary.
        """
        wordVec = gensim.models.KeyedVectors.load_word2vec_format("../word2vec/word2Vec.bin", binary=True)
        vocab = []
        wordEmbedding = []
        # Add "PAD" and "UNK": PAD gets a zero vector, UNK a random one
        vocab.append("PAD")
        vocab.append("UNK")
        wordEmbedding.append(np.zeros(self._embeddingSize))
        wordEmbedding.append(np.random.randn(self._embeddingSize))

        for word in words:
            try:
                vector = wordVec[word]
                vocab.append(word)
                wordEmbedding.append(vector)
            except KeyError:
                print(word + " is not in the word2vec vocabulary")

        return vocab, np.array(wordEmbedding)

    def _genVocabulary(self, reviews, labels):
        """
        Build the word vectors and the word-to-index vocabulary (the full dataset can be used here).
        """
        allWords = [word for review in reviews for word in review]

        # Remove stop words
        subWords = [word for word in allWords if word not in self._stopWordDict]
        wordCount = Counter(subWords)  # count word frequencies
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        # Remove low-frequency words
        words = [item[0] for item in sortWordCount if item[1] >= 5]

        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding
        word2idx = dict(zip(vocab, list(range(len(vocab)))))

        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))
        # Save the word-to-index vocabulary as JSON so it can be loaded directly at inference time
        with open("../data/wordJson/word2idx.json", "w", encoding="utf-8") as f:
            json.dump(word2idx, f)

        with open("../data/wordJson/label2idx.json", "w", encoding="utf-8") as f:
            json.dump(label2idx, f)

        return word2idx, label2idx

    def _readStopWord(self, stopWordPath):
        """
        Read the stop-word list.
        """
        with open(stopWordPath, "r") as f:
            stopWords = f.read()
            stopWordList = stopWords.splitlines()
            # Store the stop words in a dict so lookups are fast
            self._stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))

    def dataGen(self):
        """
        Initialize the training and evaluation sets.
        """
        # Load the stop words
        self._readStopWord(self._stopWordSource)

        # Load the dataset
        reviews, labels = self._readData(self._dataSource)

        # Build the word-to-index vocabulary and the embedding matrix
        word2idx, label2idx = self._genVocabulary(reviews, labels)

        # Convert labels and sentences to indices
        labelIds = self._labelToIndex(labels, label2idx)
        reviewIds = self._wordToIndex(reviews, word2idx)

        # Split into training and evaluation sets
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviewIds, labelIds, word2idx,
                                                                                    self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels

        self.evalReviews = evalReviews
        self.evalLabels = evalLabels

# Build the data used by the later modules
config = parameter_config.Config()
data = Dataset(config)
data.dataGen()
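Once dataGen() has run, the splits and the embedding matrix are exposed as attributes. With the default config (25,000 labeled reviews, rate = 0.8, sequenceLength = 200, embeddingSize = 200) the shapes come out roughly as below; the exact vocabulary size depends on the data:

print(data.trainReviews.shape)   # e.g. (20000, 200): token-index matrix
print(data.trainLabels.shape)    # e.g. (20000,)
print(data.evalReviews.shape)    # e.g. (5000, 200)
print(data.wordEmbedding.shape)  # (vocab_size, 200); rows 0 and 1 are PAD and UNK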
3.3 Model definition: mode_structure.py
import tensorflow as tf
import parameter_config

# 3 Model definition: Bi-LSTM
class BiLSTM(object):
    """
    Bi-LSTM for text classification.
    """
    def __init__(self, config, wordEmbedding):
        # Model inputs
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")
        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")

        # L2 loss accumulator
        l2Loss = tf.constant(0.0)

        # Embedding layer
        with tf.name_scope("embedding"):
            # Initialize the embedding matrix with the pretrained word vectors
            self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
            # Map the input word indices to word vectors; shape [batch_size, sequence_length, embedding_size]
            self.embeddedWords = tf.nn.embedding_lookup(self.W, self.inputX)

        # Two stacked bidirectional LSTM layers
        with tf.name_scope("Bi-LSTM"):
            for idx, hiddenSize in enumerate(config.model.hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(idx)):
                    # Forward LSTM cell
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)
                    # Backward LSTM cell
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                    # Dynamic RNN supports variable-length input; with no lengths given it uses the full sequence.
                    # outputs is a tuple (output_fw, output_bw); each element has shape
                    # [batch_size, max_time, hidden_size], with the same hidden_size for fw and bw.
                    # self.current_state is the final state, a tuple (state_fw, state_bw),
                    # where each state is itself an LSTMStateTuple (c, h).
                    outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(lstmFwCell, lstmBwCell,
                                                                                  self.embeddedWords, dtype=tf.float32,
                                                                                  scope="bi-lstm" + str(idx))

                    # Concatenate the fw and bw outputs: [batch_size, time_step, hidden_size * 2]
                    self.embeddedWords = tf.concat(outputs, 2)

        # Take the output at the first time step as the input to the fully connected layer
        # (at index 0 the backward half has already read the entire sequence)
        finalOutput = self.embeddedWords[:, 0, :]

        outputSize = config.model.hiddenSizes[-1] * 2  # fw and bw outputs are concatenated, hence * 2
        output = tf.reshape(finalOutput, [-1, outputSize])  # reshape to the fully connected layer's input shape

        # Fully connected output layer
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW",
                shape=[outputSize, config.numClasses],
                initializer=tf.contrib.layers.xavier_initializer())

            outputB = tf.Variable(tf.constant(0.1, shape=[config.numClasses]), name="outputB")
            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.nn.xw_plus_b(output, outputW, outputB, name="logits")
            if config.numClasses == 1:
                self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.float32, name="predictions")
            elif config.numClasses > 1:
                self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")

        # Cross-entropy loss (binary or multi-class)
        with tf.name_scope("loss"):
            if config.numClasses == 1:
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                                 labels=tf.cast(tf.reshape(self.inputY, [-1, 1]),
                                                                                dtype=tf.float32))
            elif config.numClasses > 1:
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.inputY)

            self.loss = tf.reduce_mean(losses) + config.model.l2RegLambda * l2Loss
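As a quick smoke test of the graph definition, the model can be instantiated with a dummy embedding matrix (a sketch assuming TensorFlow 1.x; the vocabulary size 5000 is arbitrary):

import numpy as np
import tensorflow as tf
import parameter_config
import mode_structure

config = parameter_config.Config()
dummyEmbedding = np.random.randn(5000, config.model.embeddingSize)  # stand-in for the real matrix

with tf.Graph().as_default():
    lstm = mode_structure.BiLSTM(config, dummyEmbedding)
    print(lstm.embeddedWords.shape)  # (?, 200, 512): fw and bw outputs of the last layer concatenated
    print(lstm.logits.shape)         # (?, 1) for binary classification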
3.4 Model training: mode_trainning.py
import os
import shutil
import datetime
import numpy as np
import tensorflow as tf
import parameter_config
import get_train_data
import mode_structure

# Limited memory on this machine, so force training onto the CPU
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Build the data from the earlier modules
config = parameter_config.Config()
data = get_train_data.Dataset(config)
data.dataGen()

# 4 Batch generation
def nextBatch(x, y, batchSize):
    # Shuffle the data, then yield batches from a generator
    perm = np.arange(len(x))
    np.random.shuffle(perm)
    x = x[perm]
    y = y[perm]
    numBatches = len(x) // batchSize
    for i in range(numBatches):
        start = i * batchSize
        end = start + batchSize
        batchX = np.array(x[start: end], dtype="int64")
        batchY = np.array(y[start: end], dtype="float32")
        yield batchX, batchY
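The generator can be exercised directly on the data built above:

batchX, batchY = next(nextBatch(data.trainReviews, data.trainLabels, config.batchSize))
print(batchX.shape, batchY.shape)  # (128, 200) (128,)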
# 5 Metric functions
"""
Definitions of the performance metrics.
"""
def mean(item: list) -> float:
    """
    Mean of the elements of a list.
    :param item: list object
    :return:
    """
    res = sum(item) / len(item) if len(item) > 0 else 0
    return res
def accuracy(pred_y, true_y):
    """
    Accuracy for both binary and multi-class predictions.
    :param pred_y: predictions
    :param true_y: ground truth
    :return:
    """
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    corr = 0
    for i in range(len(pred_y)):
        if pred_y[i] == true_y[i]:
            corr += 1
    acc = corr / len(pred_y) if len(pred_y) > 0 else 0
    return acc

def binary_precision(pred_y, true_y, positive=1):
    """
    Precision for binary classification.
    :param pred_y: predictions
    :param true_y: ground truth
    :param positive: index of the positive class
    :return:
    """
    corr = 0
    pred_corr = 0
    for i in range(len(pred_y)):
        if pred_y[i] == positive:
            pred_corr += 1
            if pred_y[i] == true_y[i]:
                corr += 1
    prec = corr / pred_corr if pred_corr > 0 else 0
    return prec

def binary_recall(pred_y, true_y, positive=1):
    """
    Recall for binary classification.
    :param pred_y: predictions
    :param true_y: ground truth
    :param positive: index of the positive class
    :return:
    """
    corr = 0
    true_corr = 0
    for i in range(len(pred_y)):
        if true_y[i] == positive:
            true_corr += 1
            if pred_y[i] == true_y[i]:
                corr += 1
    rec = corr / true_corr if true_corr > 0 else 0
    return rec

def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
    """
    F-beta score for binary classification.
    :param pred_y: predictions
    :param true_y: ground truth
    :param beta: beta value
    :param positive: index of the positive class
    :return:
    """
    precision = binary_precision(pred_y, true_y, positive)
    recall = binary_recall(pred_y, true_y, positive)
    try:
        f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
    except ZeroDivisionError:
        f_b = 0
    return f_b

def multi_precision(pred_y, true_y, labels):
    """
    Macro-averaged precision for multi-class classification.
    :param pred_y: predictions
    :param true_y: ground truth
    :param labels: list of labels
    :return:
    """
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    precisions = [binary_precision(pred_y, true_y, label) for label in labels]
    prec = mean(precisions)
    return prec

def multi_recall(pred_y, true_y, labels):
    """
    Macro-averaged recall for multi-class classification.
    :param pred_y: predictions
    :param true_y: ground truth
    :param labels: list of labels
    :return:
    """
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    recalls = [binary_recall(pred_y, true_y, label) for label in labels]
    rec = mean(recalls)
    return rec

def multi_f_beta(pred_y, true_y, labels, beta=1.0):
    """
    Macro-averaged F-beta for multi-class classification.
    :param pred_y: predictions
    :param true_y: ground truth
    :param labels: list of labels
    :param beta: beta value
    :return:
    """
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
    f_beta = mean(f_betas)
    return f_beta

def get_binary_metrics(pred_y, true_y, f_beta=1.0):
    """
    All metrics for binary classification.
    Returns (accuracy, recall, precision, f_beta) - note the order.
    """
    acc = accuracy(pred_y, true_y)
    recall = binary_recall(pred_y, true_y)
    precision = binary_precision(pred_y, true_y)
    f_beta = binary_f_beta(pred_y, true_y, f_beta)
    return acc, recall, precision, f_beta

def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
    """
    All metrics for multi-class classification.
    Returns (accuracy, recall, precision, f_beta) - note the order.
    """
    acc = accuracy(pred_y, true_y)
    recall = multi_recall(pred_y, true_y, labels)
    precision = multi_precision(pred_y, true_y, labels)
    f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
    return acc, recall, precision, f_beta
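A quick sanity check of these helpers on toy data (2 of the 3 predicted positives are correct, and both true positives are recovered):

pred = [1, 1, 0, 1]
true = [1, 0, 0, 1]
acc, recall, precision, f1 = get_binary_metrics(pred, true)
print(acc, recall, precision, f1)  # 0.75 1.0 0.666... 0.8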
# 6 Model training
# Fetch the training and evaluation sets
trainReviews = data.trainReviews
trainLabels = data.trainLabels
evalReviews = data.evalReviews
evalLabels = data.evalLabels
wordEmbedding = data.wordEmbedding
labelList = data.labelList

# Define the computation graph
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    session_conf.gpu_options.per_process_gpu_memory_fraction = 0.9  # cap GPU memory usage
    sess = tf.Session(config=session_conf)

    # Define the session
    with sess.as_default():
        lstm = mode_structure.BiLSTM(config, wordEmbedding)
        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        # Optimizer with the configured learning rate
        optimizer = tf.train.AdamOptimizer(config.training.learningRate)
        # Compute gradients, yielding (gradient, variable) pairs
        gradsAndVars = optimizer.compute_gradients(lstm.loss)
        # Apply the gradients to the variables to build the training op
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # Summaries for TensorBoard
        gradSummaries = []
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = os.path.abspath(os.path.join(os.path.curdir, "summarys"))
        print("Writing to {}\n".format(outDir))
        lossSummary = tf.summary.scalar("loss", lstm.loss)
        summaryOp = tf.summary.merge_all()
        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.summary.FileWriter(trainSummaryDir, sess.graph)
        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.summary.FileWriter(evalSummaryDir, sess.graph)

        # Checkpoint saver (keeps the 5 most recent checkpoints)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # One way to save the model: export it as a SavedModel (pb file)
        savedModelPath = "../model/Bi-LSTM/savedModel"
        if os.path.exists(savedModelPath):
            shutil.rmtree(savedModelPath)  # the directory is non-empty, so os.rmdir would fail
        builder = tf.saved_model.builder.SavedModelBuilder(savedModelPath)

        sess.run(tf.global_variables_initializer())

        def trainStep(batchX, batchY):
            """
            One training step.
            """
            feed_dict = {
                lstm.inputX: batchX,
                lstm.inputY: batchY,
                lstm.dropoutKeepProb: config.model.dropoutKeepProb
            }
            _, summary, step, loss, predictions = sess.run(
                [trainOp, summaryOp, globalStep, lstm.loss, lstm.predictions],
                feed_dict)
            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY,
                                                              labels=labelList)
            trainSummaryWriter.add_summary(summary, step)
            return loss, acc, prec, recall, f_beta

        def devStep(batchX, batchY):
            """
            One evaluation step.
            """
            feed_dict = {
                lstm.inputX: batchX,
                lstm.inputY: batchY,
                lstm.dropoutKeepProb: 1.0
            }
            summary, step, loss, predictions = sess.run(
                [summaryOp, globalStep, lstm.loss, lstm.predictions],
                feed_dict)
            # get_*_metrics return (acc, recall, precision, f_beta), so unpack in that order
            if config.numClasses == 1:
                acc, recall, precision, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, precision, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)
            evalSummaryWriter.add_summary(summary, step)
            return loss, acc, precision, recall, f_beta

        for i in range(config.training.epoches):
            # Train the model
            print("start training model")
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                loss, acc, prec, recall, f_beta = trainStep(batchTrain[0], batchTrain[1])
                currentStep = tf.train.global_step(sess, globalStep)
                print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
                    currentStep, loss, acc, recall, prec, f_beta))
                if currentStep % config.training.evaluateEvery == 0:
                    print("\nEvaluation:")
                    losses = []
                    accs = []
                    f_betas = []
                    precisions = []
                    recalls = []
                    for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, precision, recall, f_beta = devStep(batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        f_betas.append(f_beta)
                        precisions.append(precision)
                        recalls.append(recall)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}, step: {}, loss: {}, acc: {}, precision: {}, recall: {}, f_beta: {}".format(
                        time_str, currentStep, mean(losses), mean(accs),
                        mean(precisions), mean(recalls), mean(f_betas)))
                if currentStep % config.training.checkpointEvery == 0:
                    # The other way to save the model: write a checkpoint file
                    path = saver.save(sess, "../model/Bi-LSTM/model/my-model", global_step=currentStep)
                    print("Saved model checkpoint to {}\n".format(path))

        inputs = {"inputX": tf.saved_model.utils.build_tensor_info(lstm.inputX),
                  "keepProb": tf.saved_model.utils.build_tensor_info(lstm.dropoutKeepProb)}
        outputs = {"predictions": tf.saved_model.utils.build_tensor_info(lstm.predictions)}
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs, outputs=outputs,
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING],
                                             signature_def_map={"predict": prediction_signature},
                                             legacy_init_op=legacy_init_op)
        builder.save()
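The exported SavedModel can later be loaded without rebuilding the graph. A minimal loading sketch, assuming TensorFlow 1.x and the "predict" signature defined above:

import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    # Load the graph and variables written by the builder above
    metaGraph = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                           "../model/Bi-LSTM/savedModel")
    signature = metaGraph.signature_def["predict"]
    inputX = signature.inputs["inputX"].name
    keepProb = signature.inputs["keepProb"].name
    predictions = signature.outputs["predictions"].name
    # xIds would be a [1, sequenceLength] array of word indices (see predict.py below):
    # pred = sess.run(predictions, feed_dict={inputX: xIds, keepProb: 1.0})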
3.5 Prediction: predict.py
# Author: yifan
import os
import csv
import time
import datetime
import random
import json
from collections import Counter
from math import sqrt
import gensim
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
import parameter_config
config = parameter_config.Config()

# 7 Prediction code
x = "this movie is full of references like mad max ii the wild one and many others the ladybug´s face it´s a clear reference or tribute to peter lorre this movie is a masterpiece we´ll talk much more about in the future"

# Note: these two dictionaries must be the same ones that were used when the loaded model was trained
with open("../data/wordJson/word2idx.json", "r", encoding="utf-8") as f:
    word2idx = json.load(f)

with open("../data/wordJson/label2idx.json", "r", encoding="utf-8") as f:
    label2idx = json.load(f)
idx2label = {value: key for key, value in label2idx.items()}

xIds = [word2idx.get(item, word2idx["UNK"]) for item in x.split(" ")]
if len(xIds) >= config.sequenceLength:
    xIds = xIds[:config.sequenceLength]
else:
    xIds = xIds + [word2idx["PAD"]] * (config.sequenceLength - len(xIds))

graph = tf.Graph()
with graph.as_default():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False, gpu_options=gpu_options)
    sess = tf.Session(config=session_conf)

    with sess.as_default():
        checkpoint_file = tf.train.latest_checkpoint("../model/Bi-LSTM/model/")
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Fetch the input placeholders the model needs to be fed
        inputX = graph.get_operation_by_name("inputX").outputs[0]
        dropoutKeepProb = graph.get_operation_by_name("dropoutKeepProb").outputs[0]

        # Fetch the output tensor
        predictions = graph.get_tensor_by_name("output/predictions:0")

        pred = sess.run(predictions, feed_dict={inputX: [xIds], dropoutKeepProb: 1.0})[0]

        # Map the numeric predictions back to labels (int() handles the float32 output of the binary model)
        pred = [idx2label[int(item)] for item in pred]
        print(pred)
Result
Running predict.py prints the predicted label for the sample review above.

The full code is available at: https://github.com/yifanhunter/NLP_textClassifier
