Type of site crawled:
Forum-style websites
Main third-party modules used:
BeautifulSoup: parses and traverses pages
urllib: handles URL requests
Flask: a lightweight web framework
Introduction:
This project uses urllib to fetch the pages and BeautifulSoup to parse them, returning the data as JSON.
Features:
- Fetch page content with urllib via GET requests
- Configuration through a JSON file
- Parse the page structure and return JSON-structured data
- Expose the functionality as a REST service
Highlights:
1. A data-description service reporting the total pages, items per page, and total item count;
2. Incremental requests: only fetch data posted between the last request time and now;
3. Throttled request intervals to keep the IP from being banned (see the sketch after this list);
4. Paginated data requests;
5. Changeable search keywords, with the last keywords recorded.
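The throttling in point 3 comes down to comparing the current timestamp with the last recorded request time. A minimal sketch of the idea (the real implementation is checkRequestTime in the parsing service below; allow_request is a hypothetical helper name):

from datetime import datetime

def allow_request(last_request_time, request_gap):
    # Allow the first request (timestamp 0), or any request made
    # at least request_gap seconds after the previous one.
    now = datetime.now().timestamp()
    return last_request_time == 0 or last_request_time + request_gap <= now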
Main code structure:
- Common request wrapper
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib import request
from urllib.parse import quote
import string
import requests

# Base class for static pages
class StaticBase:
    # Fetch page content (an alternative fetcher based on the requests
    # library; not called elsewhere in this code)
    def __getHTMLText(self, url, code="utf-8"):
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = code
            return r.text
        except requests.RequestException:
            return ""

    # Request data via GET, implemented with urllib
    def getUrl(self, url, code='utf-8'):
        # percent-encode the non-ASCII parts of the URL (e.g. Chinese keywords)
        url = quote(url, safe=string.printable)
        req = request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
        with request.urlopen(req) as f:
            print('Status:', f.status, f.reason)
            return f.read().decode(code)

# s = StaticBase()
# print(s.getUrl('http://www.baidu.com/','utf-8'))
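A quick sketch of what the quote(url, safe=string.printable) call does: it is what lets the search URL carry Chinese keywords, percent-encoding only the non-ASCII characters while leaving the rest of the URL intact:

from urllib.parse import quote
import string

url = 'http://shangyu.108sq.cn/shuo/search?sertype=4&key=污染'
print(quote(url, safe=string.printable))
# http://shangyu.108sq.cn/shuo/search?sertype=4&key=%E6%B1%A1%E6%9F%93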
- Configuration file
{
"host": "http://shangyu.108sq.cn",
"base_url": "http://shangyu.108sq.cn/shuo/search?sertype=4",
"key_words": "污染",
"page_size": 30,
"search_key": "",
"last_request_time": 1562142204.149511,
"request_gap": 60
}
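The fields mirror how the parsing service below uses them: key_words holds the default search keywords, search_key the last user-supplied ones, last_request_time is a Unix timestamp used for incremental fetching, and request_gap is the minimum number of seconds between requests. A minimal round-trip sketch of reading and updating the file (mirroring __getConfig/__setConfig below):

import json

with open('config.json', encoding='utf-8') as f:
    cfg = json.load(f)
cfg['last_request_time'] = 0  # reset to force a full (non-incremental) fetch
with open('config.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(cfg, ensure_ascii=False))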
- Parsing service
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from datetime import datetime
import json
from common.staticBase import StaticBase

class Shangyh108(StaticBase):
    __config_file = "config.json"  # configuration file
    __text = ""                    # fetched page content to parse
    __config_dict = {}             # configuration dict
    __url = ""                     # request URL
    __keywords = ""                # search keywords
    __last_request_time = 0        # timestamp of the previous request

    # Constructor
    def __init__(self):
        self.__config_dict = self.__getConfig()
        # Search keywords: prefer the last saved search key, fall back to the default
        if len(self.__config_dict['search_key']) > 0:
            self.__keywords = self.__config_dict['search_key']
        else:
            self.__keywords = self.__config_dict['key_words']
        self.__url = self.__getUrl()

    # Fetch the page content
    def getText(self):
        print(self.__url)
        self.__text = StaticBase.getUrl(self, self.__url)

    # Fetch and parse the first page
    def getFirstPageText(self, url=''):
        if self.checkRequestTime():
            if len(url) == 0:
                url = self.__getUrl()
            self.__text = StaticBase.getUrl(self, url)
            return self.parseHTML()
        else:
            print("Too many requests; please try again later")

    # Fetch and parse the next page
    def getNextPage(self, url):
        url = self.__config_dict['host'] + url
        print(url)
        self.__text = StaticBase.getUrl(self, url)
        return self.parseHTML()

    # Throttle requests so they do not overload the server: enforce at least
    # request_gap seconds (60 in the sample config) between requests
    def checkRequestTime(self):
        request_gap = self.__config_dict['request_gap']
        last_request_time = self.__config_dict['last_request_time']
        dt_now = datetime.now().timestamp()
        self.__last_request_time = last_request_time  # remember the previous request time for incremental fetching
        if last_request_time == 0:  # first request, always allowed
            last_request_time = dt_now
        elif last_request_time + request_gap > dt_now:
            print("Requests too frequent; please retry later")
            return False
        else:
            last_request_time = dt_now
        self.__setConfig('last_request_time', last_request_time)
        return True

    # Get the page description: record count, page size, total pages, page URLs
    def getDesc(self):
        self.getText()
        soup = BeautifulSoup(self.__text, 'html.parser')
        obj_count = soup.select('.count')[0]
        count_str = str(obj_count.string).replace("(共", "").replace("條)", "")
        count = int(count_str)
        pageSize = int(self.__config_dict['page_size'])
        host = self.__config_dict['host']
        if count % pageSize == 0:
            pages = count // pageSize
        else:
            pages = count // pageSize + 1
        desc = {}
        desc['host'] = host
        desc['count'] = count
        desc['page_size'] = pageSize
        desc['total_page'] = pages
        # Collect the pagination URLs
        if pages > 0:
            pageUrls = soup.select(".TCPage__middle > a")
            page_url = []
            for i in range(len(pageUrls) - 1):
                tag = pageUrls[i + 1]
                page_url.append(tag['href'])
            desc['page_url'] = page_url
        return json.dumps(desc, ensure_ascii=False)

    # Parse the page content into a list of items
    def parseHTML(self):
        soup = BeautifulSoup(self.__text, 'html.parser')
        list_li = soup.select('.TCSayList .TCSayList_li')
        data_list = []
        for i in range(len(list_li)):
            item = {}
            temp = list_li[i]
            publish_time = temp.select('.TCSayList_li_time')[0]
            int_dt = int(publish_time['data-time'])
            # Incremental fetch: keep only items newer than the previous request
            if self.__last_request_time == 0 or self.__last_request_time < int_dt:
                # publish time
                item['publish_time_long'] = publish_time['data-time']
                item['publish_time_str'] = datetime.fromtimestamp(int(publish_time['data-time'])).strftime(
                    '%Y-%m-%d %H:%M:%S')
                # data tag
                item['data-tag'] = temp['data-tag']
                # author
                author = temp.select('.TCSayList_li_author')[0]
                item['author_name'] = author.string
                item['author_url'] = author['href']
                # title
                if len(temp.select('.TCSayList__title a')) > 0:
                    title = temp.select('.TCSayList__title a')[0]
                    item['title'] = title.string
                    item['link_url'] = title['href']
                # content
                item['content'] = temp.select('.TCSayList_li_content')[0]['data-short']
                data_list.append(item)
        return data_list

    # Load the request configuration
    def __getConfig(self):
        with open(self.__config_file, "r", encoding="utf-8") as load_f:
            load_dict = json.load(load_f)
        return load_dict

    # Update a configuration entry and persist it
    def __setConfig(self, key, value):
        self.__config_dict[key] = value
        print(self.__config_dict)
        with open(self.__config_file, 'w', encoding="utf-8") as f:
            f.write(json.dumps(self.__config_dict, ensure_ascii=False))

    # Prompt for search keywords and remember them
    def getKeywords(self):
        self.__keywords = input("Enter search keywords, joining multiple keywords with '+' (default: 環保+污染+投訴; press Enter to use the default): ")
        if len(self.__keywords) == 0:
            self.__keywords = self.__config_dict['key_words']
        else:
            self.__setConfig("search_key", self.__keywords)

    # Build the request URL
    def __getUrl(self):
        base_url = self.__config_dict['base_url']
        # append the search keyword query parameter
        url = base_url + "&key=" + self.__keywords
        return url
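A minimal sketch of driving the parser directly, without the REST layer. It assumes config.json sits in the working directory, as the class expects, and uses the same import path as the REST server below:

from changshuo108.shangyu.Shangyh108 import Shangyh108

s = Shangyh108()
print(s.getDesc())             # JSON string: host, count, page_size, total_page, page_url
items = s.getFirstPageText()   # list of item dicts, or None if throttled
if items:
    for it in items:
        print(it['publish_time_str'], it.get('title'))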
- REST server (only a few endpoints are exposed so far, but the corresponding service implementations are all in place)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from flask import Flask, jsonify, abort, request
from changshuo108.shangyu.Shangyh108 import Shangyh108

app = Flask(__name__)
# allow non-ASCII (Chinese) characters in JSON responses
app.config['JSON_AS_ASCII'] = False
shangyue = Shangyh108()

@app.route('/shangyu108/api/desc', methods=['GET'])
def get_desc():
    return shangyue.getDesc()

@app.route('/shangyu108/api/first_page', methods=['GET'])
def get_firstPage():
    return jsonify({'data': shangyue.getFirstPageText('')})

@app.route('/shangyu108/api/page', methods=['POST'])
def get_article():
    if not request.json or 'url' not in request.json:
        abort(400)
    print(request.json['url'])
    return jsonify({'data': shangyue.getNextPage(request.json['url'])})

if __name__ == '__main__':
    app.run(debug=True)
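Once the Flask app is running (http://127.0.0.1:5000 by default), the endpoints can be exercised with a small client sketch; the pagination path in the POST body is illustrative and would normally come from the page_url list returned by /desc:

import requests

base = 'http://127.0.0.1:5000/shangyu108/api'
print(requests.get(base + '/desc').json())        # description: count, pages, page URLs
print(requests.get(base + '/first_page').json())  # first page of parsed items
# POST one of the pagination paths returned by /desc
r = requests.post(base + '/page', json={'url': '/shuo/search?sertype=4&key=%E6%B1%A1%E6%9F%93&page=2'})
print(r.json())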
Official WeChat account link: https://mp.weixin.qq.com/s?__biz=Mzg4MzI3MjM4NQ==&tempkey=MTAxNl9KVjVnVCtVNlo4RUpIZmZXbzBfSVR4dHU4YUhhX3hPNGMxVXdMd1JaQ21OZExlNnNybmJzaVhCT2hkZk85RzZKbzRlYWxFcEk1U2g5bmN4cWJ1QlNmNEdmWlBvVWxGTER2NDM5NjdWa1VIaDVWZlFyUF9EVmtYM0lmNnplRzRjanZsWEo4RUlESTg2YlFkVjBxdDFXbzEwR1UtVVpSd2V5U0R1YUVnfn4%3D&chksm=4f48bebe783f37a8622096cf8cb7d5dfbc5d913e3f1694ea601f51eec5aadddee66271739639#rd