一. urllib庫中將字典轉化為url的查詢參數
二.請求異常的處理,以及內部的判斷邏輯
1.返回的json數據為空:原因是requests的請求對象沒有加請求頭和cookies
import requests
from urllib.parse import urlencode


def get_page_index():
    """Request the first page of search results and print the response text.

    The request is sent without any headers or cookies.  Nothing is printed
    when the status code is not 200.
    """
    query = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    # urlencode() turns the dict into the query-string part of the URL.
    url = "{0}{1}".format(
        "https://www.toutiao.com/api/search/content/?", urlencode(query)
    )
    resp = requests.get(url)
    if resp.status_code != 200:
        return
    print(resp.text)


if __name__ == '__main__':
    get_page_index()
# 結果:
{"count":0,"return_count":0,"query_id":"6537385837821170952","has_more":0,"request_id":"20190919170154010017090029827CF0A","search_id":"20190919170154010017090029827CF0A","cur_ts":1568883714,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","log_pb":{"impr_id":"20190919170154010017090029827CF0A"},"data":null,"data_head":[{"challenge_code":1366,"cell_type":71,"keyword":"街拍","url":"sslocal://search?keyword=%E8%A1%97%E6%8B%8D\u0026from=\u0026source=search_tab"}],"ab_fields":null,"latency":0,"search_type":2,"tab_rank":null}
2.正常獲得數據
import requests
from urllib.parse import urlencode


def get_page_index():
    """Request the first page of search results and print the decoded body.

    Reads the module-level ``headers`` and ``cookies`` that the ``__main__``
    guard below defines before calling this function.  Nothing is printed
    when the status code is not 200.
    """
    query = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    url = "{0}{1}".format(
        "https://www.toutiao.com/api/search/content/?", urlencode(query)
    )
    resp = requests.get(url, headers=headers, cookies=cookies)
    if resp.status_code != 200:
        return
    print(resp.content.decode("utf-8"))


if __name__ == '__main__':
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
    # NOTE(review): the whole browser "Cookie" header string is stored under
    # the single key "Cookie" instead of one entry per cookie name -- confirm
    # this is intended.
    cookies = {"Cookie": "tt_webid=6719272225969096196; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6719272225969096196; csrftoken=b28e41c77cd4f268af393de7d3e9d47a; UM_distinctid=16c4159a9ae7e3-04be696c185f6c-3f385c06-1fa400-16c4159a9afa94; CNZZDATA1259612802=1303724616-1564459685-https%253A%252F%252Fwww.toutiao.com%252F%7C1564459685; WIN_WH=1536_710; s_v_web_id=e588fb5c6570d79a16b67e84decce3d8; __tasessionId=y99fyeyyt1568882979794"}
    get_page_index()

# Output:
# {"count":20,"return_count":20,"query_id":"6537385837821170952","has_more":1,"request_id":"20190919170856010017031149086E0FC","search_id":"20190919170856010017031149086E0FC","cur_ts":1568884136,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","tokens":["街拍"],"log_pb":{"impr_id":"20190919170856010017031149086E0FC"},"data":[{"ala_src":"user","app_info":{"query_type":"AladdinRpcQueryType"},"cell_type ... (remaining output omitted)
四:
圖片地址位置定位:要先請求這個網址,獲得響應後再解析出對應的image_url
解析報錯:SyntaxError: Non-UTF-8 code starting with '\xe5',在程序上方添加 # -*- coding:utf-8 -*-
json中的鍵值對,期望用雙引號而不是單引號。原因是正則錯誤:
五:完整的代碼
# -*- coding:utf-8 -*-
"""Crawl Toutiao search results for a keyword, download the gallery images
and store each gallery's title, image URLs and page URL in MongoDB."""
import re
import requests
from urllib.parse import urlencode
import os
from requests.exceptions import RequestException
import json
import pymongo
from bs4 import BeautifulSoup
from config import *
from hashlib import md5

# Connection object for the database server.
client = pymongo.MongoClient(MONGO_URL)
# Database handle, selected by name.
db = client[MONGO_DB]

# Request headers and cookies shared by every HTTP call.  They live at module
# level (not under the __main__ guard) so the functions below also work when
# this file is imported.  Replace the "xx" placeholders with real values.
headers = {"User-Agent": "xx"}
# NOTE(review): requests expects one entry per cookie name; storing a whole
# browser "Cookie" header string under the key "Cookie" should be confirmed.
cookies = {"Cookie": "xx"}

# Seconds to wait for a response; without a timeout a stalled request would
# block the crawler indefinitely.
REQUEST_TIMEOUT = 10

# Key point 6: re.S lets "." match newlines as well.  A raw string is used so
# that "\(" is a regex escape rather than an invalid string escape.
GALLERY_PATTERN = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)


def _fetch(url):
    """Return the response for *url*, or None on a request error or non-200 status."""
    try:
        # Key points 2 and 4: RequestException is the base class of every
        # exception raised by requests, so one handler covers them all.
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=REQUEST_TIMEOUT
        )
    except RequestException:
        print("請求出錯")
        return None
    if response.status_code == 200:
        return response
    return None


def _decode_gallery(raw):
    """Turn the text captured from ``JSON.parse("...")`` into a Python object.

    The captured text is the *content of a string literal* whose value is JSON,
    e.g. ``{\\"count\\":11,\\"sub_images\\":[{\\"url\\":\\"http:\\\\\\u002F...``.
    Feeding it to json.loads() directly fails with "Expecting property name
    enclosed in double quotes", so it is decoded twice: first as a string
    literal, then as JSON.  Returns None when the text cannot be decoded.
    """
    try:
        # strict=False tolerates raw control characters inside the literal.
        return json.loads(json.loads('"{0}"'.format(raw), strict=False))
    except ValueError:
        pass
    try:
        # Fallback: the earlier heuristic of dropping every backslash.  It
        # breaks on escaped quotes inside values, hence only a second attempt.
        return json.loads(raw.replace("\\", "").replace("u002F", "/"))
    except ValueError:
        return None


def get_page_index(offset, keyword):
    """Return the search-result JSON text for *keyword* at *offset*, or None on failure."""
    data = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289",
    }
    # Key point 1: urlencode() turns {"a": "1", "b": "2"} into "a=1&b=2".
    url = "https://www.toutiao.com/api/search/content/?" + urlencode(data)
    response = _fetch(url)
    if response is None:
        return None
    return response.content.decode()


def parse_page_index(html):
    """Yield every ``article_url`` found in the search-result JSON text *html*.

    Yields nothing when *html* is not valid JSON or when its "data" field is
    missing or null (the API returns ``"data": null`` for rejected requests).
    """
    try:
        data = json.loads(html)
    except ValueError:
        return
    if not isinstance(data, dict):
        return
    # Key point 3: dict.get() returns None instead of raising for a missing key.
    for item in data.get("data") or []:
        if isinstance(item, dict) and "article_url" in item:
            yield item.get("article_url")


def get_page_detial(url):
    """Return the decoded body of the detail page at *url*, or None on failure."""
    response = _fetch(url)
    if response is None:
        return None
    return response.content.decode()


def parse_page_detial(html, url):
    """Extract the gallery from a detail page and download its images.

    Returns a dict with "title", "images" and "url", or None when the page
    has no gallery block or the gallery data cannot be decoded.
    """
    soup = BeautifulSoup(html, "lxml")
    # Key point 5: CSS selectors via soup.select(); a page without <title>
    # yields an empty list, so guard before indexing.
    title_tags = soup.select("title")
    title = title_tags[0].get_text() if title_tags else ""
    result = GALLERY_PATTERN.search(html)
    if not result:
        return None
    data = _decode_gallery(result.group(1))
    if not isinstance(data, dict) or "sub_images" not in data:
        return None
    sub_images = data.get("sub_images") or []
    images = [item.get("url") for item in sub_images]
    for img in images:
        download(img)
    return {
        "title": title,
        "images": images,
        "url": url,
    }


def save_to_mongo(ret_dict):
    """Insert *ret_dict* into the configured collection; return True on success."""
    # Key point 8: the MongoDB connection settings come from the config module.
    # insert_one() replaces Collection.insert(), which PyMongo 4 removed.
    result = db[MONGO_TABLE].insert_one(ret_dict)
    if result.inserted_id is not None:
        print("插入數據到數據庫成功", ret_dict["title"])
        return True
    return False


def download(url):
    """Download the image at *url* and save it to disk; always returns None."""
    print("正在下載圖片", url)
    response = _fetch(url)
    if response is not None:
        saveimg(response.content)
    return None


def saveimg(content):
    """Write the image bytes *content* to ``<cwd>/<md5 of content>.jpg`` unless it exists."""
    # Key point 9: naming the file after the MD5 of its bytes de-duplicates
    # identical images.
    file_path = os.path.join(os.getcwd(), "{0}.jpg".format(md5(content).hexdigest()))
    # Key point 10: os.getcwd() / os.path.exists().
    if not os.path.exists(file_path):
        with open(file_path, "wb") as f:
            f.write(content)


def main():
    """Crawl every result page from START_PAGE up to END_PAGE in steps of 20."""
    keyword = "街拍"
    for offset in range(START_PAGE, END_PAGE, 20):
        index_html = get_page_index(offset, keyword)
        if not index_html:
            continue
        for url in parse_page_index(index_html):
            detail_html = get_page_detial(url)
            if not detail_html:
                continue
            ret = parse_page_detial(detail_html, url)
            if ret:
                save_to_mongo(ret)


if __name__ == '__main__':
    main()
試運行爬取所有的街拍:報錯json.decoder.JSONDecodeError,因此代碼還得進行優化,排除異常。
六:知識點總結
urlencode是urllib.parse模塊中的一個函數:將字典變成url的查詢參數(鍵值對之間用&連接)
from urllib.parse import urlencode

data = {"a": 1, "b": 2}
# The scheme must be followed by "//"; "http:www.baidu.com" is not a valid URL.
url = "http://www.baidu.com/?"
# urlencode() joins the key/value pairs as "a=1&b=2".
print(url + urlencode(data))
# Output:
# http://www.baidu.com/?a=1&b=2
md5加密的不一致問題
一直以來都是用 hashlib 中的 md5 進行加密(嚴格來說是計算摘要):md5.update(二進制)、md5.hexdigest()。這樣可能會出現對相同的字符串進行加密、加密結果卻不一樣的問題,看來是update方法造成的。
from hashlib import md5

demo = ["1", "1", "3", "3"]
fp = md5()
for text in demo:
    # The same hash object is reused, so each update() adds to the data fed
    # in so far and every digest printed below covers a longer input.
    fp.update(text.encode("utf-8"))
    print(fp.hexdigest())

# Output:
# c4ca4238a0b923820dcc509a6f75849b
# 6512bd43d9caa6e02c990b0a82652dca
# 73278a4a86960eeb576a8fd4c9ec6997
# fd06b8ea02fe5b1c2496fe1700e9d16c
# 原因是md5.update()會將上次的串和這次的進行拼接,即依次對 1、11、113、1133 計算摘要,每次加密的串都不同,結果肯定不同。
所以每加密之前,都對md5進行實例化,才能保證相同內容加密結果一樣,因為以前這個方法都是放在函數里面的,每次調用函數,都會重新實例化md5,因此不存在問題。循環就存在問題
上面代碼可以改為
from hashlib import md5

demo = ["1", "1", "3", "3"]
for i in demo:
    # A new hash object per item: each digest depends on that item alone.
    fp = md5()
    fp.update(i.encode("utf-8"))
    print(fp.hexdigest())
# 結果為:
c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3
for text in demo:
    # This form works too: md5(...) builds a fresh hash object on every pass.
    digest = md5(text.encode("utf-8")).hexdigest()
    print(digest)

# Output:
# c4ca4238a0b923820dcc509a6f75849b
# c4ca4238a0b923820dcc509a6f75849b
# eccbc87e4b5ce2fe28308fd9f2a7baf3
# eccbc87e4b5ce2fe28308fd9f2a7baf3
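# 看源碼也沒有理解update的真正意圖,只是說用字符串更新對象,後續解決。補充:按hashlib官方文檔,對同一個對象多次調用update()等價於把各次的數據拼接後只調用一次,即 m.update(a); m.update(b) 等同於 m.update(a+b);這是為了分塊處理大數據(例如逐塊讀取文件)而設計的。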
os模塊的使用方法
os的基本用法
1. os.getcwd():查看當前所在路徑。
current_path = os.getcwd()
print(current_path)
# 運行結果
C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\User
2. os.listdir(path):列舉目錄下的所有文件。返回的是列表類型。
dir_list = os.listdir(current_path)
print(dir_list)
# 運行結果
['11.py', 'cuiqingcai.py', 'Localization.sublime-settings', 'oscrypto-ca-bundle.crt', 'Package Control.cache', 'Package Control.last-run', 'Package Control.merged-ca-bundle', 'Package Control.sublime-settings', 'Package Control.user-ca-bundle', 'Preferences.sublime-settings', 'reids分布式鎖', 'sha1.py', 'test.py', 'untitled.sublime-build']
具體用法見:https://www.cnblogs.com/yufeihlf/p/6179547.html
Mongo數據庫與python的交互
import pymongo  # the driver module used to talk to MongoDB

# Step 1: create a client connected to the MongoDB server (host and port).
from pymongo import MongoClient

client = MongoClient(host, port)
# db名 plays the role of the database name, 集合名 the role of the table name.
collection = client[db名][集合名]

# Step 2: insert a document.
ret = collection.insert_one({"name": "test10010", "age": 33})
print(ret)

# Use the returned value to decide what to do next.
if ret:
    xxxx
示例:
import pymongo

client = pymongo.MongoClient("localhost")
# Select collection "new" in database "test"; it is created if it does not exist.
collection = client["test"]["new"]
# insert_one() replaces Collection.insert(), which PyMongo 4 removed.
ret = collection.insert_one({"new": "python"})
# inserted_id is the _id of the document that was just inserted.
print(ret.inserted_id)
# 結果:
5d85ce978a808f42364b045c
插入前:

插入后:

正則表達式知識點回顧:
import re

# re.compile() returns a compiled pattern object; use it together with
# search / match / findall and similar functions.
pattern = re.compile("匹配規則", re.S)

# The text spans two lines, so there is a newline between the b's and the 1's.
a = """aaaaaaabbbbbbbb
111111ccccc"""

# Without re.S, "." does not match a newline, so nothing is found.
pattern1 = re.compile("aaaaaaa(.*?)cccc")
print(re.search(pattern1, a))
# None

# With re.S, "." matches newlines too, so line breaks in the text are no problem.
pattern2 = re.compile("aaaaaaa(.*?)cccc", re.S)
print(re.search(pattern2, a))
# <re.Match object; span=(0, 26), match='aaaaaaabbbbbbbb\n111111cccc'>