https://github.com/MoyanZitto/keras-cn/blob/master/docs/legacy/blog/word_embedding.md 這個鏈接將帶有 embedding 層的 CNN 的實現及訓練過程講得很清楚。
構建好帶有 embedding 層的 TextCNN 模型後,調用 model.fit 時傳入的 x_train 是二維的,其內容是要訓練的詞所對應的整數標號。下面的代碼會將詞進行標號。
import keras.preprocessing.text as T
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
# Demo: assign an integer id to every word of a tiny corpus with the Keras
# Tokenizer, then bring the resulting id sequences to a common length.
text1 = 'some/thing to eat'
text2 = 'some thing to drink'
texts = [text1, text2]

# Show text1 with its '/' separator replaced by a space.
print(' '.join(text1.split('/')))

# num_words: None or an int -- the maximum number of words to keep, ranked by
# frequency (per the Keras docs only the num_words - 1 most common words are
# used). None keeps every word.
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(texts)

# One list of word ids per input text.
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)  # was `print(sequence)`, an undefined name (NameError)

word_index = tokenizer.word_index

# Bring every id sequence to length 10 (maxlen).
data = pad_sequences(sequences, maxlen=10)
print(data)

print('Found %s unique tokens.' % len(word_index))
# Expected contents are shown next to each print; the printed container
# type may differ from the literal shown here.
print(tokenizer.word_counts)  # [('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]
print(tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
print(tokenizer.word_docs)  # {'some': 2, 'thing': 2, 'to': 2, 'drink': 1, 'eat': 1}
print(tokenizer.index_docs)  # like word_docs, but keyed by word id instead of the word