1. Scraping JD Product Reviews
JD.py
import requests
from urllib.parse import quote
from urllib.parse import urlencode
from lxml import etree
import logging
import json
import time
class JDSpider:
    # Spider class: pass in a product category (e.g. phone, laptop) to build an instance, then call getData to crawl the reviews.
def __init__(self, categlory):
        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (quote(categlory))  # JD search start page
self.commentBaseUrl = "https://club.jd.com/comment/productPageComments.action?"
self.headers = {
"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
self.productsId = self.getId()
        self.comtype = {0: "negative", 1: "medium", 2: "positive"}
self.categlory = categlory
self.iplist = {
'http': [],
'https': []
}
def getParamUrl(self, productid, page, score):
        # Request parameters controlling the page number and page size; required, otherwise JD detects the crawler and returns no review data.
params = {
"productId" : "%s" % (productid),
"score": "%s" % (score), # 1: 差評, 2: 中評, 3: 好評
"page": "%s" % (page),
"sortType": "5",
"pageSize": "10",
"isShadowSku": "0",
"rid": "0",
"fold": "1"
}
url = self.commentBaseUrl + urlencode(params)
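        # e.g. https://club.jd.com/comment/productPageComments.action?productId=<sku>&score=2&page=1&sortType=5&pageSize=10&isShadowSku=0&rid=0&fold=1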
return params, url
    # Unlike the initial self.headers, the per-product headers add a Referer pointing to the product page.
def getHeaders(self, productid):
header = {
"Referer": "https://item.jd.com/%s.html" % (productid),
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
return header
    # Get the product ids from the search result page (used to build each product's page URL); the resulting list is stored in self.productsId.
def getId(self):
response = requests.get(self.startUrl, headers=self.headers)
if response.status_code != 200:
logging.warning("狀態碼錯誤,爬蟲異常!")
html = etree.HTML(response.text)
return html.xpath('//li[@class="gl-item"]/@data-sku')
    # maxPage is the maximum number of review pages to crawl; each page holds 10 reviews.
    def getData(self, maxPage, score):
        # The maximum page count differs between review types; usually positive >> negative > medium.
        # score selects the review type, matching self.comtype: 2 = positive, 1 = medium, 0 = negative.
comments = []
scores = []
for j in range(len(self.productsId)):
id = self.productsId[j]
header = self.getHeaders(id)
for i in range(1, maxPage):
param, url = self.getParamUrl(id, i, score)
print(">>>>>>>>>>>>>>>>第:%d 個,第 %d 頁" % (j, i))
try:
response = requests.get(url, headers=header, params=param)
except Exception as e:
logging.warning(e)
break
if response.status_code != 200:
logging.warning("狀態碼錯誤,爬蟲連接異常")
continue
                time.sleep(2)  # throttle requests
if response.text == '':
logging.warning("未爬取到信息")
continue
try:
res_json = json.loads(response.text)
except Exception as e:
logging.warning(e)
continue
if len((res_json['comments'])) == 0:
logging.warning("頁面次數已到:%d,超出范圍" % (i))
break
logging.info("正在爬取%s %s 第 %d" % (self.categlory, self.comtype[score], i))
for cdit in res_json['comments']:
comment = cdit['content'].replace("\n", ' ').replace('\r', ' ')
comments.append(comment)
scores.append(cdit['score'])
print(comment)
savepath = './data/' + self.categlory + '_' + self.comtype[score] + '.csv'
logging.warning("已爬取%d 條 %s 評價信息" % (len(comments), self.comtype[score]))
with open(savepath, 'a+', encoding='utf8') as f:
for i in range(len(comments)):
f.write("%d\t%s\t%s\n" % (i, scores[i], comments[i]))
logging.warning("數據已保存在 %s" % (savepath))
if __name__=='__main__':
list = ['電腦','手機','耳機']
for item in list:
spider = JDSpider(item)
        spider.getData(10, 2)  # positive reviews
        spider.getData(10, 1)  # medium reviews
        spider.getData(10, 0)  # negative reviews
The items in list are the product categories to crawl (e.g. laptops, phones, headphones); getData takes the arguments (maxPage, score):
- maxPage is the maximum number of review pages to crawl, 10 reviews per page. The maximum page count differs between review types; generally positive >> negative > medium.
- When maxPage exceeds the number of available pages, the loop breaks out automatically, so it is safe to set it generously.
- score selects the review type: positive 2, medium 1, negative 0.
Run JD.py; the crawled review files are saved under the data/ directory.
Then run ProcessData.py to split the raw data into training, validation and test sets in an 8:1:1 ratio; the splits are saved as CSV files under the dataset/ directory.
ProcessData.py
import os
import random
import pandas as pd
data_list = []
file_dir = "data/"
all_csv_list = os.listdir(file_dir)
for single_csv in all_csv_list:
with open(os.path.join(file_dir, single_csv), encoding="utf-8") as file:
for line in file:
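            # each line was written by JD.py as: index<TAB>star score<TAB>comment text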
label = line.replace('\n', '').split('\t')[1]
if (int(label) < 2): # 0: negative
label = 0
elif (int(label) > 4): # 2: positive
label = 2
else:
label = 1
sentence = line.replace('\n', '').split('\t')[2]
data_list.append([sentence, label])
random.shuffle(data_list)
# Split the whole corpus 1:1:8 into test, validation and training sets
n = len(data_list) // 10
test_list = data_list[:n]
dev_list = data_list[n : n*2]
train_list = data_list[n*2 : ]
print('訓練集數量: {}'.format(str(len(train_list))))
print('驗證集數量: {}'.format(str(len(dev_list))))
print('測試集數量: {}'.format(str(len(test_list))))
name = ['Sentence', "Label"]
csv_train = pd.DataFrame(columns=name, data=train_list)
csv_train.to_csv('dataset/csv_train.csv', encoding='utf8', index=False)
csv_dev = pd.DataFrame(columns=name, data=dev_list)
csv_dev.to_csv('dataset/csv_dev.csv', encoding='utf8', index=False)
csv_test = pd.DataFrame(columns=name, data=test_list)
csv_test.to_csv('dataset/csv_test.csv', encoding='utf8', index=False)
2. Processing the Data with TorchText
- The text field is processed with the sequential, tokenize and use_vocab options, plus fix_length: every example of this field is padded (or truncated) to that fixed length.
- Then build the vocabulary with build_vocab and split the data into iterators.
DataSet.py
import torch
from torchtext import data
import jieba
import re
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
def x_tokenize(x):
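    # Strip everything that is not a Chinese character (U+4E00-U+9FA5), then segment the remainder with jieba.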
str1 = re.sub('[^\u4e00-\u9fa5]', "", x)
return jieba.lcut(str1) # print(x_tokenize('你是誰'))
# sequential: True -> the data is sequential; if False, no tokenization is applied
# use_vocab: True -> use a Vocab object; if False, the raw data must already be numeric
# fix_length: pad/truncate every example of this field to this fixed length
# tokenize: string tokenizer applied to the raw data, e.g. tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=x_tokenize, fix_length=100,
use_vocab=True)
LABEL = data.Field(sequential=False,
use_vocab=False)
train, dev, test = data.TabularDataset.splits(path='dataset',
train='csv_train.csv',
validation='csv_dev.csv',
test='csv_test.csv',
format='csv',
skip_header=True,
csv_reader_params={'delimiter' : ','},
fields=[('text', TEXT), ('label', LABEL)])
TEXT.build_vocab(train)
train_iter, val_iter, test_iter = data.BucketIterator.splits((train, dev, test),
batch_size = 256,
shuffle = True,
sort = False,
sort_within_batch = False,
repeat = False)
def getTEXT():
return TEXT
def getLabel():
return LABEL
def getIter():
return train_iter, val_iter, test_iter
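To sanity-check the data pipeline above, a minimal sketch like the following can be run from the project root (it is not part of the original scripts and assumes the three CSV files already exist under dataset/):

import DataSet
TEXT = DataSet.getTEXT()
train_iter, val_iter, test_iter = DataSet.getIter()
print('vocab size:', len(TEXT.vocab))  # includes the <unk> and <pad> tokens
batch = next(iter(train_iter))
# Field defaults to batch_first=False, so batch.text is [fix_length, batch_size]
print(batch.text.shape, batch.label.shape)  # e.g. torch.Size([100, 256]) torch.Size([256])
print([TEXT.vocab.itos[i] for i in batch.text[:10, 0].tolist()])  # first tokens of the first example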
3. Building the Models
- [x] TextCNN
- [x] TextRNN
- [x] TextRNN+Attention
- [x] Transformer
- [x] TextRCNN
- [ ] Some other attention
The models are defined under the model/ directory; the out returned at the end of forward has shape [batch size, num_classes].
Transformer.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import sys
sys.path.append('./')
import DataSet
# device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
# Input Embedding: H = e + p
class Position_Encoding(nn.Module):
def __init__(self, embed, pad_size, dropout, device):
super(Position_Encoding, self).__init__()
self.device = device
self.pe = torch.tensor(
            # P_(pos, 2i): dimension 2i of the positional encoding for position pos
            # embed: dimensionality of the encoding vector; pos: position index
[[pos / (10000.0 ** (i // 2 * 2.0 / embed)) for i in range(embed)] for pos in range(pad_size)]
)
self.pe[:, 0::2] = torch.sin(self.pe[:, 0::2])
self.pe[:, 1::2] = torch.cos(self.pe[:, 1::2])
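        # This gives the standard sinusoidal encoding:
        # PE(pos, 2i) = sin(pos / 10000^(2i/embed)), PE(pos, 2i+1) = cos(pos / 10000^(2i/embed))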
self.dropout = nn.Dropout(dropout)
# print(self.pe.shape) # [pad_size, embed]
def forward(self, x):
out = x + nn.Parameter(self.pe, requires_grad=False).to(self.device)
out = self.dropout(out)
        return out  # [batch_size, pad_size, embed]
# A single attention operation (scaled dot-product)
class Scaled_Dot_Product_Attention(nn.Module):
''' Scaled Dot-Product Attention '''
def __init__(self):
super(Scaled_Dot_Product_Attention, self).__init__()
def forward(self, Q, K, V, scale=None, mask=None):
"""
Args:
Q: [batch_size, len_Q, dim_Q]
K: [batch_size, len_K, dim_K]
V: [batch_size, len_V, dim_V]
            scale: scaling factor, 1 / sqrt(d_k) in the paper
        Return:
            the context tensor produced by self-attention
"""
attention = torch.matmul(Q, K.transpose(-2, -1)) # [seq_len, dim_h]·[dim_h, seq_len] = [seq_len, seq_len]
if scale:
attention = attention * scale # attention / math.sqrt(D_k)
        if mask is not None:
attention = attention.masked_fill_(mask == 0, -1e9)
attention = F.softmax(attention, dim=-1) # [batch_size, h, seq_len, seq_len]
context = torch.matmul(attention, V) # [batch_size, h, seq_len, dim_head]
return context
# Multi-Head Attention
class Multi_Head_Attention(nn.Module):
def __init__(self, embedding_dim, num_head, dropout=0.0):
super(Multi_Head_Attention, self).__init__()
assert embedding_dim % num_head == 0
        # number of heads
self.num_head = num_head
        # dimension of each head after splitting embedding_dim into num_head parts
self.dim_head = embedding_dim // self.num_head
        # fc_Q, fc_K, fc_V and the final output fc
self.linears = self.clones(nn.Linear(embedding_dim, embedding_dim), 4) # embedding_dim = self.dim_head * self.num_head
self.attention = Scaled_Dot_Product_Attention()
self.dropout = nn.Dropout(p = dropout)
self.layer_norm = nn.LayerNorm(embedding_dim)
def clones(self, module, N = 4):
return nn.ModuleList(copy.deepcopy(module) for _ in range(N))
def forward(self, x, mask = None):
batch_size = x.size(0)
        # 1. Do all the linear projections in batch: embedding_dim => num_head x dim_head
# [batch, seq_len, num_head, dim_head] -> [batch, num_head, seq_len, dim_head]
query, key, value = [l(x).view(batch_size, -1, self.num_head, self.dim_head).transpose(1, 2)
for l in self.linears[:3]]
        if mask is not None:
            mask = mask.unsqueeze(1)  # [batch, 1, seq_len]
        scale = key.size(-1) ** -0.5  # scaling factor: 1 / sqrt(d_k)
# 2. Apply attention on all the projected vectors in batch.
# atten: [batch, num_head, seq_len, dim_head]
context = self.attention(query, key, value, scale=scale, mask=mask)
# [batch, seq_len, emb_dim]
context = context.transpose(1,2).contiguous().view(batch_size, -1, self.dim_head * self.num_head)
out = self.linears[-1](context) # [batch, seq_len, emb_dim]
out = self.dropout(out)
        out = out + x  # residual connection
out = self.layer_norm(out) # layerNorm
return out
# Feed Forward + Add + LayerNorm
class Position_wise_Feed_Forward(nn.Module):
def __init__(self, embedding_dim, output_size, dropout=0.0):
super(Position_wise_Feed_Forward, self).__init__()
self.fc1 = nn.Linear(embedding_dim, output_size)
self.fc2 = nn.Linear(output_size, embedding_dim)
self.dropout = nn.Dropout(dropout)
self.layer_norm = nn.LayerNorm(embedding_dim)
def forward(self, x):
out = self.fc1(x)
out = F.relu(out)
out = self.fc2(out)
out = self.dropout(out)
        out = out + x  # residual connection
out = self.layer_norm(out) # LayerNorm
return out
# Encoder
class Encoder(nn.Module):
def __init__(self, embedding_dim, num_head, output_size, dropout):
super(Encoder, self).__init__()
self.attention = Multi_Head_Attention(embedding_dim, num_head, dropout)
self.feed_forward = Position_wise_Feed_Forward(embedding_dim, output_size, dropout)
def forward(self, x):
out = self.attention(x)
out = self.feed_forward(out)
return out
# Transformer
class Transformer(nn.Module):
def __init__(self,
                 vocab_size = len(DataSet.getTEXT().vocab),  # vocabulary size
                 seq_len = 100,
                 n_class = 3,  # number of output classes
                 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
                 embed_dim = 300,  # embedding dimension
                 dropout = 0.5,
                 num_head = 5,  # number of attention heads
                 output_size = 1024,
                 num_encoder = 2,  # number of stacked encoder layers
):
super(Transformer, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim)
# H: e + p
self.postion_embedding = Position_Encoding(embed_dim, seq_len, dropout, device)
# Multi-Head Attention + Add + Norm
self.encoder = Encoder(embed_dim, num_head, output_size, dropout)
self.encoders = nn.ModuleList([
copy.deepcopy(self.encoder) for _ in range(num_encoder)
])
        # output layer
self.fc1 = nn.Linear(seq_len * embed_dim, n_class)
def forward(self, x):
out = self.embedding(x)
out = self.postion_embedding(out)
for encoder in self.encoders:
out = encoder(out)
out = out.view(out.size(0), -1) # [batch size, seq_len * embed_dim]
out = self.fc1(out) # [batch size, n_class]
return out
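A minimal shape check for the model above (a sketch, not part of the original code; it assumes the dataset/ CSVs exist, since the default vocab_size is read from DataSet, and it keeps everything on the CPU):

import torch
from model.Transformer import Transformer
model = Transformer(device=torch.device('cpu'))
x = torch.randint(0, 100, (4, 100))  # [batch size, seq_len] of token indices; assumes the vocab has more than 100 entries
out = model(x)
print(out.shape)  # expected: torch.Size([4, 3]), i.e. [batch size, n_class]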
4. Running the Model
train.py
import torch
import torch.nn.functional as F
from torch import nn, optim
import DataSet
from torchtext import data  # needed for Example/BucketIterator in predict()
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
# from model.TextCNN import TextCNN
# from model.TextRCNN import TextRCNN
# from model.TextRNN import TextRNN
# from model.TextRNN_Attention import TextRNN_Attention
from model.Transformer import Transformer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_NAME = 'transformer'
model = Transformer().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
train_iter, dev_iter, test_iter = DataSet.getIter()
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def binary_acc(preds, target):
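    # despite the name, this is plain multi-class accuracy computed via argmax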
preds = torch.argmax(preds, dim = 1)
correct = (preds == target)
acc = correct.sum().item() / len(correct)
return acc
def train_model(model, train_iter, optimizer, criterion):
    # switch to training mode
model.train()
avg_loss = []
avg_acc = []
    print('training..............')
for batch in train_iter:
        feature = torch.t(batch.text).to(device)
        target = batch.label.to(device)
        # forward pass
        pred = model(feature)
        # loss
        loss = criterion(pred, target)
acc = binary_acc(pred, target)
avg_loss.append(loss.item())
avg_acc.append(acc)
optimizer.zero_grad()
loss.backward()
optimizer.step()
avg_acc = np.array(avg_acc).mean()
avg_loss = np.array(avg_loss).mean()
return avg_loss, avg_acc
# evaluation function
def evaluate_model(model, iterator, criterion):
avg_loss = []
avg_acc = []
    model.eval()  # switch to evaluation mode
with torch.no_grad():
for batch in iterator:
            feature = torch.t(batch.text).to(device)
            target = batch.label.to(device)
            # forward pass
            pred = model(feature)
            # loss
            loss = criterion(pred, target)
acc = binary_acc(pred, target)
avg_loss.append(loss.item())
avg_acc.append(acc)
avg_loss = np.array(avg_loss).mean()
avg_acc = np.array(avg_acc).mean()
return avg_loss, avg_acc
def test_model(model, iterator, criterion):
avg_loss = []
avg_acc = []
    model.eval()  # switch to evaluation mode
y_true = []
y_pred = []
with torch.no_grad():
for batch in iterator:
            feature = torch.t(batch.text).to(device)
            target = batch.label.to(device)
            # forward pass
            pred = model(feature)
            # loss
            loss = criterion(pred, target)
acc = binary_acc(pred, target)
avg_loss.append(loss.item())
avg_acc.append(acc)
y_true.extend(target.cpu().numpy())
y_pred.extend(torch.argmax(pred, dim = 1).cpu().numpy())
avg_loss = np.array(avg_loss).mean()
avg_acc = np.array(avg_acc).mean()
score = accuracy_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
print(conf_matrix)
target_names = ['差評', '中評', '好評']
print(classification_report(y_true, y_pred, target_names=target_names))
return avg_loss, avg_acc
def saveModel(model, name):
torch.save(model.state_dict(), 'done_model/' + name + '_model.pt')
def loadModel(model, name):
model.load_state_dict(torch.load('done_model/' + name + '_model.pt', map_location=device))
def train():
best_valid_acc = float('-inf')
for epoch in range(10):
start_time = time.time()
train_loss, train_acc = train_model(model, train_iter, optimizer, criterion)
dev_loss, dev_acc = evaluate_model(model, dev_iter, criterion)
end_time = time.time()
epoch_mins, epoch_secs = divmod(end_time - start_time, 60)
        if dev_acc > best_valid_acc:  # save the model whenever validation accuracy improves
best_valid_acc = dev_acc
saveModel(model, MODEL_NAME)
print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs:.2f}s')
print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
print(f'\t Val. Loss: {dev_loss:.3f} | Val. Acc: {dev_acc*100:.2f}%')
def test():
loadModel(model, MODEL_NAME)
test_loss, test_acc = test_model(model, test_iter, criterion)
print(f'Test. Loss: {test_loss:.3f} | Test. Acc: {test_acc*100:.2f}%')
def predict():
sent1 = '垃圾,這個東西最好別買'
demo = [data.Example.fromlist(data=[sent1,0],
fields=[('text', DataSet.getTEXT()), ('label',DataSet.getLabel())])]
demo_iter = data.BucketIterator(dataset = data.Dataset(demo,
[('text',DataSet.getTEXT()), ('label',DataSet.getLabel())]),
batch_size = 256,
shuffle = True,
sort_key = lambda x:len(x.text),
sort_within_batch = False,
device = device,
repeat = False)
for batch in demo_iter:
feature = batch.text
target = batch.label
feature = torch.t(feature)
out = model(feature)
if torch.argmax(out, dim=1).item() == 0:
print('差評')
elif torch.argmax(out, dim=1).item() == 2:
print('好評')
else:
print('中評')
if __name__=='__main__':
train()
test()
predict()
trainng..............
Epoch: 01 | Epoch Time: 0.0m 20.97s
Train Loss: 1.593 | Train Acc: 62.34%
Val. Loss: 0.663 | Val. Acc: 75.34%
trainng..............
Epoch: 02 | Epoch Time: 0.0m 20.81s
Train Loss: 0.582 | Train Acc: 74.62%
Val. Loss: 0.562 | Val. Acc: 80.54%
trainng..............
Epoch: 03 | Epoch Time: 0.0m 20.82s
Train Loss: 0.523 | Train Acc: 77.62%
Val. Loss: 0.465 | Val. Acc: 82.23%
trainng..............
Epoch: 04 | Epoch Time: 0.0m 20.81s
Train Loss: 0.480 | Train Acc: 79.32%
Val. Loss: 0.529 | Val. Acc: 81.80%
trainng..............
Epoch: 05 | Epoch Time: 0.0m 20.84s
Train Loss: 0.490 | Train Acc: 79.36%
Val. Loss: 0.461 | Val. Acc: 81.96%
trainng..............
Epoch: 06 | Epoch Time: 0.0m 20.77s
Train Loss: 0.427 | Train Acc: 81.62%
Val. Loss: 0.442 | Val. Acc: 82.23%
trainng..............
Epoch: 07 | Epoch Time: 0.0m 20.79s
Train Loss: 0.416 | Train Acc: 82.37%
Val. Loss: 0.491 | Val. Acc: 81.87%
trainng..............
Epoch: 08 | Epoch Time: 0.0m 20.82s
Train Loss: 0.372 | Train Acc: 83.98%
Val. Loss: 0.447 | Val. Acc: 83.88%
trainng..............
Epoch: 09 | Epoch Time: 0.0m 20.83s
Train Loss: 0.364 | Train Acc: 84.26%
Val. Loss: 0.433 | Val. Acc: 85.02%
trainng..............
Epoch: 10 | Epoch Time: 0.0m 20.79s
Train Loss: 0.346 | Train Acc: 84.89%
Val. Loss: 0.483 | Val. Acc: 82.18%
[[591 95 9]
[192 460 61]
[ 4 19 731]]
precision recall f1-score support
差評 0.75 0.85 0.80 695
中評 0.80 0.65 0.71 713
好評 0.91 0.97 0.94 754
accuracy 0.82 2162
macro avg 0.82 0.82 0.82 2162
weighted avg 0.82 0.82 0.82 2162
Test. Loss: 0.528 | Test. Acc: 82.37%
差評