PyTorch Chinese Text Classification



1. Crawling JD Product Reviews

JD.py

import requests
from urllib.parse import quote
from urllib.parse import urlencode
from lxml import etree
import logging
import json
import time

class JDSpider:
    # Spider class: construct an instance with a product category (e.g. phones, computers), then call getData to crawl the reviews
    def __init__(self, categlory):
        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (quote(categlory))  # JD search start page
        self.commentBaseUrl = "https://club.jd.com/comment/productPageComments.action?"
        self.headers = {
            "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        self.productsId = self.getId()
        self.comtype = {0: "negative", 1: "medium", 2: "positive"}
        self.categlory = categlory
        self.iplist = {
            'http': [],
            'https': []
        }

    def getParamUrl(self, productid, page, score):
        # Query parameters controlling the page number and page size; required, otherwise JD detects the crawler and returns no data
        params = {     
            "productId" : "%s" % (productid),
            "score": "%s" % (score),         # 1: 差評, 2: 中評, 3: 好評
            "page": "%s" % (page),
            "sortType": "5",
            "pageSize": "10",
            "isShadowSku": "0",
            "rid": "0",
            "fold": "1"
        }
        url = self.commentBaseUrl + urlencode(params) 
        return params, url

    # Unlike the initial self.headers, the per-product header adds the product id as Referer
    def getHeaders(self, productid):
        header = {
            "Referer": "https://item.jd.com/%s.html" % (productid),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        return header

    # Get the product ids from the search result page (used to build each product's comment URL); results are kept in self.productsId
    def getId(self):
        response = requests.get(self.startUrl, headers=self.headers)
        
        if response.status_code != 200:
            logging.warning("狀態碼錯誤,爬蟲異常!")

        html = etree.HTML(response.text) 
        return html.xpath('//li[@class="gl-item"]/@data-sku')

    # maxPage is the maximum number of review pages to crawl, 10 reviews per page
    def getData(self, maxPage, score):
        # The last available page differs per review type; usually positive >> negative > medium
        # score selects the review type; in this script 0 = negative, 1 = medium, 2 = positive (see self.comtype)
        comments = []
        scores = []

        for j in range(len(self.productsId)):
            id = self.productsId[j]
            header = self.getHeaders(id)

            for i in range(1, maxPage):
                param, url = self.getParamUrl(id, i, score)          
                print(">>>>>>>>>>>>>>>>第:%d 個,第 %d 頁" % (j, i))
                
                try:
                    response = requests.get(url, headers=header, params=param)
                except Exception as e:
                    logging.warning(e)
                    break
                
                if response.status_code != 200:
                    logging.warning("狀態碼錯誤,爬蟲連接異常")
                    continue
                time.sleep(2)  # delay between requests
                if response.text == '':
                    logging.warning("未爬取到信息")
                    continue
                try:
                    res_json = json.loads(response.text)
                except Exception as e:
                    logging.warning(e)
                    continue
                if len((res_json['comments'])) == 0:
                    logging.warning("頁面次數已到:%d,超出范圍" % (i))
                    break
                logging.info("正在爬取%s %s 第 %d" % (self.categlory, self.comtype[score], i))
                for cdit in res_json['comments']:
                    comment = cdit['content'].replace("\n", ' ').replace('\r', ' ')
                    comments.append(comment)
                    scores.append(cdit['score'])
                    print(comment)

        savepath = './data/' + self.categlory + '_' + self.comtype[score] + '.csv'
        logging.warning("已爬取%d 條 %s 評價信息" % (len(comments), self.comtype[score]))
        with open(savepath, 'a+', encoding='utf8') as f:
            for i in range(len(comments)):
                f.write("%d\t%s\t%s\n" % (i, scores[i], comments[i]))
        logging.warning("數據已保存在 %s" % (savepath))


if __name__=='__main__':

    list = ['電腦','手機','耳機']
    for item in list:
        spider = JDSpider(item)
        spider.getData(10, 2)     # positive
        spider.getData(10, 1)     # medium
        spider.getData(10, 0)     # negative

The list holds the product categories to crawl (e.g. phones, computers); getData takes the arguments (maxPage, score):

  1. maxPage is the maximum number of review pages to crawl, 10 reviews per page. The last available page differs per review type; generally positive >> negative > medium.

  2. When the page number exceeds the last available page, the loop breaks out automatically, so it is fine to set maxPage generously.

  3. score selects the review type: positive = 2, medium = 1, negative = 0 (a short usage sketch follows).
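
For example, to crawl more negative reviews for a single category (a usage sketch; the keyword '手機' and page count are just illustrative):

spider = JDSpider('手機')
spider.getData(50, 0)    # negative reviews, pages 1-49 (10 per page); stops early once JD returns an empty comment page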

Run JD.py; the crawled files are saved under the data/ directory.
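
Each crawled file (for example data/手機_positive.csv; the name combines the category and the review type) is tab-separated: a running index, the raw star rating (1-5) returned by JD, and the comment text. A minimal sketch of one such line and of how ProcessData.py later relabels the star rating (the comment text is made up):

line = "0\t5\t手機很好用，快遞也很快"    # index \t star rating \t comment (illustrative)
idx, star, comment = line.strip().split("\t")
# ProcessData.py maps the star rating to 3 classes: 1 -> 0 (negative), 2-4 -> 1 (medium), 5 -> 2 (positive)
print(star, comment)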

Then run ProcessData.py to split the raw data 8:1:1 into training, validation, and test sets, which are saved as CSV files under the dataset/ directory.

ProcessData.py

import os
import random
import pandas as pd

data_list = []
file_dir = "data/"
all_csv_list = os.listdir(file_dir)

for single_csv in all_csv_list:
    with open(os.path.join(file_dir, single_csv), encoding="utf-8") as file:
        for line in file:
            label = line.replace('\n', '').split('\t')[1]
            if (int(label) < 2):   # 0: negative
                label = 0 
            elif (int(label) > 4): # 2: positive
                label = 2 
            else:
                label = 1
            sentence = line.replace('\n', '').split('\t')[2]
            data_list.append([sentence, label])
    
random.shuffle(data_list)
# Split the whole corpus 1:1:8 into test, validation, and training sets
n = len(data_list) // 10
test_list = data_list[:n]
dev_list = data_list[n : n*2]
train_list = data_list[n*2 : ]

print('訓練集數量: {}'.format(str(len(train_list))))
print('驗證集數量: {}'.format(str(len(dev_list))))
print('測試集數量: {}'.format(str(len(test_list))))

name = ['Sentence', "Label"]

csv_train = pd.DataFrame(columns=name, data=train_list)
csv_train.to_csv('dataset/csv_train.csv', encoding='utf8', index=False)

csv_dev = pd.DataFrame(columns=name, data=dev_list)
csv_dev.to_csv('dataset/csv_dev.csv', encoding='utf8', index=False)

csv_test = pd.DataFrame(columns=name, data=test_list)
csv_test.to_csv('dataset/csv_test.csv', encoding='utf8', index=False)
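
The resulting CSVs have a header row and two columns, Sentence and Label, which is why DataSet.py below uses skip_header=True and maps the first column to text and the second to label. A quick look at the output (a minimal sketch, assuming ProcessData.py has been run):

import pandas as pd

df = pd.read_csv('dataset/csv_train.csv')   # columns: Sentence, Label
print(df.head())                            # Label: 0 = negative, 1 = medium, 2 = positive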

2. Processing Data with TorchText

  • Apply sequential, tokenize, and use_vocab processing to the text, plus fix_length: every example of this field is padded (or truncated) to that fixed length.

  • Then build_vocab and split the data into iterators (a quick batch-shape check follows the listing below).

DataSet.py

import torch
from torchtext import data  
import jieba
import re 

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

def x_tokenize(x):
    str1 = re.sub('[^\u4e00-\u9fa5]', "", x)
    return jieba.lcut(str1)     # print(x_tokenize('你是誰'))

# sequential: True  -- whether the data is sequential; if not, no tokenization is applied
# use_vocab: True   -- whether to use a Vocab object; if not, the raw data must already be numerical
# fix_length: pad/truncate every sequence to this fixed length
# tokenize: tokenizer applied to the raw string, e.g. tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=x_tokenize, fix_length=100, 
                  use_vocab=True)

LABEL = data.Field(sequential=False,
                   use_vocab=False)

train, dev, test = data.TabularDataset.splits(path='dataset', 
                                              train='csv_train.csv',
                                              validation='csv_dev.csv', 
                                              test='csv_test.csv',
                                              format='csv',
                                              skip_header=True,
                                              csv_reader_params={'delimiter' : ','},
                                              fields=[('text', TEXT), ('label', LABEL)])
TEXT.build_vocab(train)

train_iter, val_iter, test_iter = data.BucketIterator.splits((train, dev, test),
                                                              batch_size = 256,
                                                              shuffle = True,
                                                              sort = False,
                                                              sort_within_batch = False,
                                                              repeat = False)

def getTEXT():
    return TEXT

def getLabel():
    return LABEL

def getIter():
    return train_iter, val_iter, test_iter
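
A quick check of what the iterators yield (a minimal sketch; it assumes the dataset/ CSVs from step 1 exist, so run it from the project root). Note that batch.text has shape [fix_length, batch_size] because Field defaults to batch_first=False, which is why train.py later transposes it with torch.t:

import DataSet

train_iter, val_iter, test_iter = DataSet.getIter()
batch = next(iter(train_iter))
print(batch.text.shape)              # e.g. torch.Size([100, 256]) -> [fix_length, batch_size]
print(batch.label.shape)             # e.g. torch.Size([256])
print(len(DataSet.getTEXT().vocab))  # vocabulary size built from the training split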

3. Building the Models

  • [x] TextCNN
  • [x] TextRNN
  • [x] TextRNN+Attention
  • [x] Transformer
  • [x] TextRCNN
  • [ ] Some other attention

The models are defined under the model/ directory; the out returned at the end of each forward has shape [batch size, num_classes] (see the shape check after the Transformer.py listing below).

Transformer walkthrough

Transformer.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import sys
sys.path.append('./')
import DataSet

# device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
# Input Embedding: H = e + p
class Position_Encoding(nn.Module):
    def __init__(self, embed, pad_size, dropout, device):
        super(Position_Encoding, self).__init__()
        self.device = device
        self.pe = torch.tensor(
            # P(pos, 2i): dimension 2i of the positional encoding for position pos
            # embed: dimension of the encoding vector; pos: the position index
            [[pos / (10000.0 ** (i // 2 * 2.0 / embed)) for i in range(embed)] for pos in range(pad_size)]   
        )
        self.pe[:, 0::2] = torch.sin(self.pe[:, 0::2])
        self.pe[:, 1::2] = torch.cos(self.pe[:, 1::2])
        self.dropout = nn.Dropout(dropout)
        # print(self.pe.shape)    # [pad_size, embed]

    def forward(self, x):
        out = x + nn.Parameter(self.pe, requires_grad=False).to(self.device)
        out = self.dropout(out)
        return out                # [batch_size, seq_len, embed]

# A single scaled dot-product attention
class Scaled_Dot_Product_Attention(nn.Module):
    ''' Scaled Dot-Product Attention '''
    def __init__(self):
        super(Scaled_Dot_Product_Attention, self).__init__()
    
    def forward(self, Q, K, V, scale=None, mask=None):
        """
        Args:
            Q: [batch_size, len_Q, dim_Q]
            K: [batch_size, len_K, dim_K]
            V: [batch_size, len_V, dim_V]
            scale: scaling factor, 1/sqrt(d_k) in the paper
        Return:
            the attention-weighted (context) tensor
        """
        attention = torch.matmul(Q, K.transpose(-2, -1))  # [seq_len, dim_h]·[dim_h, seq_len] = [seq_len, seq_len] 
        if scale:
            attention = attention * scale         # attention / math.sqrt(D_k)
        if mask is not None:
            attention = attention.masked_fill_(mask == 0, -1e9)
       
        attention = F.softmax(attention, dim=-1)  # [batch_size, h, seq_len, seq_len]       
        context = torch.matmul(attention, V)      # [batch_size, h, seq_len, dim_head]

        return context

# Multi-Head Attention
class Multi_Head_Attention(nn.Module):
    def __init__(self, embedding_dim, num_head, dropout=0.0):
        super(Multi_Head_Attention, self).__init__()
        assert embedding_dim % num_head == 0
        
        # number of attention heads
        self.num_head = num_head                      
        # dimension of each head (embedding_dim split into num_head parts)
        self.dim_head = embedding_dim // self.num_head
        # fc_Q, fc_K, fc_V, and the final output fc
        self.linears = self.clones(nn.Linear(embedding_dim, embedding_dim), 4) # embedding_dim = self.dim_head * self.num_head
        self.attention = Scaled_Dot_Product_Attention()
        self.dropout = nn.Dropout(p = dropout)
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def clones(self, module, N = 4):
        return nn.ModuleList(copy.deepcopy(module) for _ in range(N))

    def forward(self, x, mask = None):

        batch_size = x.size(0)
            
        # 1. Do all the linear projections in batch from embedding_dim => h x d_k
        # [batch, seq_len, num_head, dim_head] -> [batch, num_head, seq_len, dim_head]
        query, key, value = [l(x).view(batch_size, -1, self.num_head, self.dim_head).transpose(1, 2)
                             for l in self.linears[:3]]
        if mask is not None:
            mask = mask.unsqueeze(1)   # [batch, 1, seq_len]
        scale = key.size(-1) ** -0.5   # scaling factor, 1/sqrt(D_k)
        
        # 2. Apply attention on all the projected vectors in batch.
        # context: [batch, num_head, seq_len, dim_head]
        context = self.attention(query, key, value, scale=scale, mask=mask)

        # [batch, seq_len, emb_dim]
        context = context.transpose(1,2).contiguous().view(batch_size, -1, self.dim_head * self.num_head) 
        out = self.linears[-1](context)       # [batch, seq_len, emb_dim]
        out = self.dropout(out)
        out = out + x                         # residual connection
        out = self.layer_norm(out)            # LayerNorm
        return out 

# Feed Forward + Add + LayerNorm
class Position_wise_Feed_Forward(nn.Module):
    def __init__(self, embedding_dim, output_size, dropout=0.0):
        super(Position_wise_Feed_Forward, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, output_size)
        self.fc2 = nn.Linear(output_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        out = self.dropout(out)
        out = out + x               # residual connection
        out = self.layer_norm(out)  # LayerNorm
        return out

# Encoder
class Encoder(nn.Module):
    def __init__(self, embedding_dim, num_head, output_size, dropout):
        super(Encoder, self).__init__()
        self.attention = Multi_Head_Attention(embedding_dim, num_head, dropout)
        self.feed_forward = Position_wise_Feed_Forward(embedding_dim, output_size, dropout)

    def forward(self, x):
        out = self.attention(x)
        out = self.feed_forward(out)
        return out

# Transformer
class Transformer(nn.Module):
    def __init__(self,
                 vocab_size = len(DataSet.getTEXT().vocab),   # vocabulary size
                 seq_len = 100,
                 n_class = 3,       # number of classes
                 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
                 embed_dim = 300,   # embedding dimension
                 dropout = 0.5,
                 num_head = 5,      # number of attention heads
                 output_size = 1024,
                 num_encoder = 2,   # number of stacked encoder layers
                 ):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # H: e + p
        self.postion_embedding = Position_Encoding(embed_dim, seq_len, dropout, device)
        # Multi-Head Attention + Add + Norm
        self.encoder = Encoder(embed_dim, num_head, output_size, dropout)
        self.encoders = nn.ModuleList([
            copy.deepcopy(self.encoder) for _ in range(num_encoder)
        ])
        # Output projection
        self.fc1 = nn.Linear(seq_len * embed_dim, n_class)

    def forward(self, x):
        out = self.embedding(x)
        out = self.postion_embedding(out)
        for encoder in self.encoders:
            out = encoder(out)          
        out = out.view(out.size(0), -1) # [batch size, seq_len * embed_dim]
        out = self.fc1(out)             # [batch size, n_class]
        return out 
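
A minimal shape sanity check for the forward contract stated above (a sketch, not part of the original code; it assumes the dataset/ CSVs exist, because importing the model makes DataSet build its vocabulary). Run it from the project root:

import torch
from model.Transformer import Transformer

model = Transformer(device=torch.device('cpu'))   # keep the positional encoding on CPU for this check
dummy = torch.zeros(4, 100, dtype=torch.long)     # [batch size, seq_len] of token indices (all index 0, shapes only)
out = model(dummy)
print(out.shape)                                  # torch.Size([4, 3]) -> [batch size, num_classes]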

4. Running the Model

train.py

import torch
import torch.nn.functional as F
from torch import nn, optim
import DataSet
from torchtext import data   # needed by predict() below
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time

# from model.TextCNN import TextCNN
# from model.TextRCNN import TextRCNN
# from model.TextRNN import TextRNN
# from model.TextRNN_Attention import TextRNN_Attention
from model.Transformer import Transformer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_NAME = 'transformer'

model = Transformer().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
train_iter, dev_iter, test_iter = DataSet.getIter()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)  

def binary_acc(preds, target):
    preds = torch.argmax(preds, dim = 1)
    correct = (preds == target)
    acc = correct.sum().item() / len(correct)
    return acc

def train_model(model, train_iter, optimizer, criterion):
    # switch to training mode
    model.train()

    avg_loss = []
    avg_acc = []
    
    print('training..............')

    for batch in train_iter:
        feature = torch.t(batch.text).to(device)   # [batch size, seq_len]; move to the model's device
        target = batch.label.to(device)
        # forward pass
        pred = model(feature)
        # loss
        loss = criterion(pred, target)
        acc = binary_acc(pred, target)

        avg_loss.append(loss.item())
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_acc = np.array(avg_acc).mean()
    avg_loss = np.array(avg_loss).mean()
    return avg_loss, avg_acc

# evaluation function
def evaluate_model(model, iterator, criterion):
    avg_loss = []
    avg_acc = []
    model.eval()     # switch to evaluation mode

    with torch.no_grad():
        for batch in iterator:
            feature = torch.t(batch.text).to(device)
            target = batch.label.to(device)
            # forward pass
            pred = model(feature)
            # loss
            loss = criterion(pred, target)
            acc = binary_acc(pred, target)

            avg_loss.append(loss.item())
            avg_acc.append(acc)

    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()
    return avg_loss, avg_acc

def test_model(model, iterator, criterion):
    avg_loss = []
    avg_acc = []
    model.eval()     # switch to evaluation mode

    y_true = []
    y_pred = []

    with torch.no_grad():
        for batch in iterator:
            feature = torch.t(batch.text).to(device)
            target = batch.label.to(device)
            # forward pass
            pred = model(feature)
            # loss
            loss = criterion(pred, target)
            acc = binary_acc(pred, target)

            avg_loss.append(loss.item())
            avg_acc.append(acc)

            y_true.extend(target.cpu().numpy())
            y_pred.extend(torch.argmax(pred, dim = 1).cpu().numpy())

    avg_loss = np.array(avg_loss).mean()
    avg_acc = np.array(avg_acc).mean()

    score = accuracy_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    print(conf_matrix)

    target_names = ['差評', '中評', '好評']
    print(classification_report(y_true, y_pred, target_names=target_names)) 

    return avg_loss, avg_acc

def saveModel(model, name):
    torch.save(model.state_dict(), 'done_model/' + name + '_model.pt')

def loadModel(model, name):
    model.load_state_dict(torch.load('done_model/' + name + '_model.pt', map_location=device))

def train():
    best_valid_acc = float('-inf')
    for epoch in range(10):
        start_time = time.time()

        train_loss, train_acc = train_model(model, train_iter, optimizer, criterion)
        dev_loss, dev_acc = evaluate_model(model, dev_iter, criterion)

        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

        if dev_acc > best_valid_acc:    # save whenever validation accuracy improves
            best_valid_acc = dev_acc
            saveModel(model, MODEL_NAME)
        
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs:.2f}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {dev_loss:.3f} |  Val. Acc: {dev_acc*100:.2f}%')

def test():
    loadModel(model, MODEL_NAME)
    test_loss, test_acc = test_model(model, test_iter, criterion)
    print(f'Test. Loss: {test_loss:.3f} |  Test. Acc: {test_acc*100:.2f}%')

def predict():
    sent1 = '垃圾,這個東西最好別買'

    demo = [data.Example.fromlist(data=[sent1,0], 
                                 fields=[('text', DataSet.getTEXT()), ('label',DataSet.getLabel())])]

    demo_iter = data.BucketIterator(dataset = data.Dataset(demo,
                                                        [('text',DataSet.getTEXT()), ('label',DataSet.getLabel())]), 
                                                        batch_size = 256, 
                                                        shuffle = True,
                                                        sort_key = lambda x:len(x.text), 
                                                        sort_within_batch = False, 
                                                        device = device,
                                                        repeat = False)
    for batch in demo_iter:
        feature = batch.text
        target = batch.label

        feature = torch.t(feature)

        out = model(feature)
        if torch.argmax(out, dim=1).item() == 0:
            print('差評')
        elif torch.argmax(out, dim=1).item() == 2:
            print('好評')
        else:
            print('中評')


if __name__=='__main__':
    
    train()
    test()
    predict()

training..............
Epoch: 01 | Epoch Time: 0.0m 20.97s
	Train Loss: 1.593 | Train Acc: 62.34%
	 Val. Loss: 0.663 |  Val. Acc: 75.34%
training..............
Epoch: 02 | Epoch Time: 0.0m 20.81s
	Train Loss: 0.582 | Train Acc: 74.62%
	 Val. Loss: 0.562 |  Val. Acc: 80.54%
training..............
Epoch: 03 | Epoch Time: 0.0m 20.82s
	Train Loss: 0.523 | Train Acc: 77.62%
	 Val. Loss: 0.465 |  Val. Acc: 82.23%
training..............
Epoch: 04 | Epoch Time: 0.0m 20.81s
	Train Loss: 0.480 | Train Acc: 79.32%
	 Val. Loss: 0.529 |  Val. Acc: 81.80%
training..............
Epoch: 05 | Epoch Time: 0.0m 20.84s
	Train Loss: 0.490 | Train Acc: 79.36%
	 Val. Loss: 0.461 |  Val. Acc: 81.96%
training..............
Epoch: 06 | Epoch Time: 0.0m 20.77s
	Train Loss: 0.427 | Train Acc: 81.62%
	 Val. Loss: 0.442 |  Val. Acc: 82.23%
training..............
Epoch: 07 | Epoch Time: 0.0m 20.79s
	Train Loss: 0.416 | Train Acc: 82.37%
	 Val. Loss: 0.491 |  Val. Acc: 81.87%
training..............
Epoch: 08 | Epoch Time: 0.0m 20.82s
	Train Loss: 0.372 | Train Acc: 83.98%
	 Val. Loss: 0.447 |  Val. Acc: 83.88%
training..............
Epoch: 09 | Epoch Time: 0.0m 20.83s
	Train Loss: 0.364 | Train Acc: 84.26%
	 Val. Loss: 0.433 |  Val. Acc: 85.02%
training..............
Epoch: 10 | Epoch Time: 0.0m 20.79s
	Train Loss: 0.346 | Train Acc: 84.89%
	 Val. Loss: 0.483 |  Val. Acc: 82.18%
[[591  95   9]
 [192 460  61]
 [  4  19 731]]
              precision    recall  f1-score   support

          差評       0.75      0.85      0.80       695
          中評       0.80      0.65      0.71       713
          好評       0.91      0.97      0.94       754

    accuracy                           0.82      2162
   macro avg       0.82      0.82      0.82      2162
weighted avg       0.82      0.82      0.82      2162

Test. Loss: 0.528 |  Test. Acc: 82.37%

差評

