Type of site crawled:
Forum-style websites
Main third-party modules used:
BeautifulSoup: parses and traverses pages
urllib: handles URL requests
Flask: a lightweight web framework
Introduction:
This project uses urllib to fetch the pages and BeautifulSoup to parse them, returning the data as JSON.
Features:
- Fetch page content with urllib via GET requests
- Configuration through a JSON file
- Parse the page structure and return JSON-structured data
- Expose the functionality as a REST service
Highlights:
1. A data-description service reporting the total pages, items per page, and total item count;
2. Incremental requests: only fetch data posted between the last request time and now;
3. Throttled request intervals to keep the IP from being banned (see the sketch after this list);
4. Paginated data requests;
5. Changeable search keywords, with the last keywords recorded.
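The throttling in point 3 comes down to comparing the current timestamp with the last recorded request time. A minimal sketch of the idea (the real implementation is checkRequestTime in the parsing service below; allow_request is a hypothetical helper name):

from datetime import datetime

def allow_request(last_request_time, request_gap):
    # Allow the first request (timestamp 0), or any request made
    # at least request_gap seconds after the previous one.
    now = datetime.now().timestamp()
    return last_request_time == 0 or last_request_time + request_gap <= now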
Main code structure:
- Common request wrapper
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib import request
from urllib.parse import quote
import string
import requests

# Base class for static pages
class StaticBase:
    # Fetch page content (an alternative fetcher based on the requests
    # library; not called elsewhere in this code)
    def __getHTMLText(self, url, code="utf-8"):
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = code
            return r.text
        except requests.RequestException:
            return ""

    # Request data via GET, implemented with urllib
    def getUrl(self, url, code='utf-8'):
        # percent-encode the non-ASCII parts of the URL (e.g. Chinese keywords)
        url = quote(url, safe=string.printable)
        req = request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
        with request.urlopen(req) as f:
            print('Status:', f.status, f.reason)
            return f.read().decode(code)

# s = StaticBase()
# print(s.getUrl('http://www.baidu.com/','utf-8'))
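A quick sketch of what the quote(url, safe=string.printable) call does: it is what lets the search URL carry Chinese keywords, percent-encoding only the non-ASCII characters while leaving the rest of the URL intact:

from urllib.parse import quote
import string

url = 'http://shangyu.108sq.cn/shuo/search?sertype=4&key=污染'
print(quote(url, safe=string.printable))
# http://shangyu.108sq.cn/shuo/search?sertype=4&key=%E6%B1%A1%E6%9F%93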
- Configuration file
{
"host": "http://shangyu.108sq.cn",
"base_url": "http://shangyu.108sq.cn/shuo/search?sertype=4",
"key_words": "污染",
"page_size": 30,
"search_key": "",
"last_request_time": 1562142204.149511,
"request_gap": 60
}
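The fields mirror how the parsing service below uses them: key_words holds the default search keywords, search_key the last user-supplied ones, last_request_time is a Unix timestamp used for incremental fetching, and request_gap is the minimum number of seconds between requests. A minimal round-trip sketch of reading and updating the file (mirroring __getConfig/__setConfig below):

import json

with open('config.json', encoding='utf-8') as f:
    cfg = json.load(f)
cfg['last_request_time'] = 0  # reset to force a full (non-incremental) fetch
with open('config.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(cfg, ensure_ascii=False))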
- Parsing service
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from datetime import datetime
import json
from common.staticBase import StaticBase

class Shangyh108(StaticBase):
    __config_file = "config.json"  # configuration file
    __text = ""                    # fetched page content to parse
    __config_dict = {}             # configuration dict
    __url = ""                     # request URL
    __keywords = ""                # search keywords
    __last_request_time = 0        # timestamp of the previous request

    # Constructor
    def __init__(self):
        self.__config_dict = self.__getConfig()
        # Search keywords: prefer the last saved search key, fall back to the default
        if len(self.__config_dict['search_key']) > 0:
            self.__keywords = self.__config_dict['search_key']
        else:
            self.__keywords = self.__config_dict['key_words']
        self.__url = self.__getUrl()

    # Fetch the page content
    def getText(self):
        print(self.__url)
        self.__text = StaticBase.getUrl(self, self.__url)

    # Fetch and parse the first page
    def getFirstPageText(self, url=''):
        if self.checkRequestTime():
            if len(url) == 0:
                url = self.__getUrl()
            self.__text = StaticBase.getUrl(self, url)
            return self.parseHTML()
        else:
            print("Too many requests; please try again later")

    # Fetch and parse the next page
    def getNextPage(self, url):
        url = self.__config_dict['host'] + url
        print(url)
        self.__text = StaticBase.getUrl(self, url)
        return self.parseHTML()

    # Throttle requests so they do not overload the server: enforce at least
    # request_gap seconds (60 in the sample config) between requests
    def checkRequestTime(self):
        request_gap = self.__config_dict['request_gap']
        last_request_time = self.__config_dict['last_request_time']
        dt_now = datetime.now().timestamp()
        self.__last_request_time = last_request_time  # remember the previous request time for incremental fetching
        if last_request_time == 0:  # first request, always allowed
            last_request_time = dt_now
        elif last_request_time + request_gap > dt_now:
            print("Requests too frequent; please retry later")
            return False
        else:
            last_request_time = dt_now
        self.__setConfig('last_request_time', last_request_time)
        return True

    # Get the page description: record count, page size, total pages, page URLs
    def getDesc(self):
        self.getText()
        soup = BeautifulSoup(self.__text, 'html.parser')
        obj_count = soup.select('.count')[0]
        count_str = str(obj_count.string).replace("(共", "").replace("條)", "")
        count = int(count_str)
        pageSize = int(self.__config_dict['page_size'])
        host = self.__config_dict['host']
        if count % pageSize == 0:
            pages = count // pageSize
        else:
            pages = count // pageSize + 1
        desc = {}
        desc['host'] = host
        desc['count'] = count
        desc['page_size'] = pageSize
        desc['total_page'] = pages
        # Collect the pagination URLs
        if pages > 0:
            pageUrls = soup.select(".TCPage__middle > a")
            page_url = []
            for i in range(len(pageUrls) - 1):
                tag = pageUrls[i + 1]
                page_url.append(tag['href'])
            desc['page_url'] = page_url
        return json.dumps(desc, ensure_ascii=False)

    # Parse the page content into a list of items
    def parseHTML(self):
        soup = BeautifulSoup(self.__text, 'html.parser')
        list_li = soup.select('.TCSayList .TCSayList_li')
        data_list = []
        for i in range(len(list_li)):
            item = {}
            temp = list_li[i]
            publish_time = temp.select('.TCSayList_li_time')[0]
            int_dt = int(publish_time['data-time'])
            # Incremental fetch: keep only items newer than the previous request
            if self.__last_request_time == 0 or self.__last_request_time < int_dt:
                # publish time
                item['publish_time_long'] = publish_time['data-time']
                item['publish_time_str'] = datetime.fromtimestamp(int(publish_time['data-time'])).strftime(
                    '%Y-%m-%d %H:%M:%S')
                # data tag
                item['data-tag'] = temp['data-tag']
                # author
                author = temp.select('.TCSayList_li_author')[0]
                item['author_name'] = author.string
                item['author_url'] = author['href']
                # title
                if len(temp.select('.TCSayList__title a')) > 0:
                    title = temp.select('.TCSayList__title a')[0]
                    item['title'] = title.string
                    item['link_url'] = title['href']
                # content
                item['content'] = temp.select('.TCSayList_li_content')[0]['data-short']
                data_list.append(item)
        return data_list

    # Load the request configuration
    def __getConfig(self):
        with open(self.__config_file, "r", encoding="utf-8") as load_f:
            load_dict = json.load(load_f)
        return load_dict

    # Update a configuration entry and persist it
    def __setConfig(self, key, value):
        self.__config_dict[key] = value
        print(self.__config_dict)
        with open(self.__config_file, 'w', encoding="utf-8") as f:
            f.write(json.dumps(self.__config_dict, ensure_ascii=False))

    # Prompt for search keywords and remember them
    def getKeywords(self):
        self.__keywords = input("Enter search keywords, joining multiple keywords with '+' (default: 環保+污染+投訴; press Enter to use the default): ")
        if len(self.__keywords) == 0:
            self.__keywords = self.__config_dict['key_words']
        else:
            self.__setConfig("search_key", self.__keywords)

    # Build the request URL
    def __getUrl(self):
        base_url = self.__config_dict['base_url']
        # append the search keyword query parameter
        url = base_url + "&key=" + self.__keywords
        return url
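A minimal sketch of driving the parser directly, without the REST layer. It assumes config.json sits in the working directory, as the class expects, and uses the same import path as the REST server below:

from changshuo108.shangyu.Shangyh108 import Shangyh108

s = Shangyh108()
print(s.getDesc())             # JSON string: host, count, page_size, total_page, page_url
items = s.getFirstPageText()   # list of item dicts, or None if throttled
if items:
    for it in items:
        print(it['publish_time_str'], it.get('title'))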
- REST server (only a few endpoints are exposed so far, but the corresponding service implementations are all in place)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from flask import Flask, jsonify, abort, request
from changshuo108.shangyu.Shangyh108 import Shangyh108

app = Flask(__name__)
# allow non-ASCII (Chinese) characters in JSON responses
app.config['JSON_AS_ASCII'] = False
shangyue = Shangyh108()

@app.route('/shangyu108/api/desc', methods=['GET'])
def get_desc():
    return shangyue.getDesc()

@app.route('/shangyu108/api/first_page', methods=['GET'])
def get_firstPage():
    return jsonify({'data': shangyue.getFirstPageText('')})

@app.route('/shangyu108/api/page', methods=['POST'])
def get_article():
    if not request.json or 'url' not in request.json:
        abort(400)
    print(request.json['url'])
    return jsonify({'data': shangyue.getNextPage(request.json['url'])})

if __name__ == '__main__':
    app.run(debug=True)
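Once the Flask app is running (http://127.0.0.1:5000 by default), the endpoints can be exercised with a small client sketch; the pagination path in the POST body is illustrative and would normally come from the page_url list returned by /desc:

import requests

base = 'http://127.0.0.1:5000/shangyu108/api'
print(requests.get(base + '/desc').json())        # description: count, pages, page URLs
print(requests.get(base + '/first_page').json())  # first page of parsed items
# POST one of the pagination paths returned by /desc
r = requests.post(base + '/page', json={'url': '/shuo/search?sertype=4&key=%E6%B1%A1%E6%9F%93&page=2'})
print(r.json())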
Official WeChat account link: https://mp.weixin.qq.com/s?__biz=Mzg4MzI3MjM4NQ==&tempkey=MTAxNl9KVjVnVCtVNlo4RUpIZmZXbzBfSVR4dHU4YUhhX3hPNGMxVXdMd1JaQ21OZExlNnNybmJzaVhCT2hkZk85RzZKbzRlYWxFcEk1U2g5bmN4cWJ1QlNmNEdmWlBvVWxGTER2NDM5NjdWa1VIaDVWZlFyUF9EVmtYM0lmNnplRzRjanZsWEo4RUlESTg2YlFkVjBxdDFXbzEwR1UtVVpSd2V5U0R1YUVnfn4%3D&chksm=4f48bebe783f37a8622096cf8cb7d5dfbc5d913e3f1694ea601f51eec5aadddee66271739639#rd