整理了一下網易雲歌曲評論抓取、分析好友信息抓取、淘寶寶貝抓取、今日頭條美圖抓取的一些代碼
抓取網易雲評論
進入歌曲界面:
找到如下的數據源:

貼一段Lyrichu的代碼:
(運行環境為 Python 2.7)
# -*- coding: utf-8 -*-
# @Time : 2017/3/28 8:46
# @Author : Lyrichu
# @Email : 919987476@qq.com
# @File : NetCloud_spider3.py
'''
@Description:
網易雲音樂評論爬蟲,可以完整爬取整個評論
部分參考了@平胸小仙女的文章(地址:https://www.zhihu.com/question/36081767)
post加密部分也給出了,可以參考原帖:
作者:平胸小仙女
鏈接:https://www.zhihu.com/question/36081767/answer/140287795
來源:知乎
'''
from Crypto.Cipher import AES
import base64
import requests
import json
import codecs
import time
# 頭部信息
headers = {
'Host':"music.163.com",
'Accept-Language':"zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
'Accept-Encoding':"gzip, deflate",
'Content-Type':"application/x-www-form-urlencoded",
'Cookie':"_ntes_nnid=754361b04b121e078dee797cdb30e0fd,1486026808627; _ntes_nuid=754361b04b121e078dee797cdb30e0fd; JSESSIONID-WYYY=yfqt9ofhY%5CIYNkXW71TqY5OtSZyjE%2FoswGgtl4dMv3Oa7%5CQ50T%2FVaee%2FMSsCifHE0TGtRMYhSPpr20i%5CRO%2BO%2B9pbbJnrUvGzkibhNqw3Tlgn%5Coil%2FrW7zFZZWSA3K9gD77MPSVH6fnv5hIT8ms70MNB3CxK5r3ecj3tFMlWFbFOZmGw%5C%3A1490677541180; _iuqxldmzr_=32; vjuids=c8ca7976.15a029d006a.0.51373751e63af8; vjlast=1486102528.1490172479.21; __gads=ID=a9eed5e3cae4d252:T=1486102537:S=ALNI_Mb5XX2vlkjsiU5cIy91-ToUDoFxIw; vinfo_n_f_l_n3=411a2def7f75a62e.1.1.1486349441669.1486349607905.1490173828142; P_INFO=m15527594439@163.com|1489375076|1|study|00&99|null&null&null#hub&420100#10#0#0|155439&1|study_client|15527594439@163.com; NTES_CMT_USER_INFO=84794134%7Cm155****4439%7Chttps%3A%2F%2Fsimg.ws.126.net%2Fe%2Fimg5.cache.netease.com%2Ftie%2Fimages%2Fyun%2Fphoto_default_62.png.39x39.100.jpg%7Cfalse%7CbTE1NTI3NTk0NDM5QDE2My5jb20%3D; usertrack=c+5+hljHgU0T1FDmA66MAg==; Province=027; City=027; _ga=GA1.2.1549851014.1489469781; __utma=94650624.1549851014.1489469781.1490664577.1490672820.8; __utmc=94650624; __utmz=94650624.1490661822.6.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; playerid=81568911; __utmb=94650624.23.10.1490672820",
'Connection':"keep-alive",
'Referer':'http://music.163.com/'
}
# 設置代理服務器
proxies= {
'http:':'http://121.232.146.184',
'https:':'https://144.255.48.197'
}
# offset的取值為:(評論頁數-1)*20,total第一頁為true,其余頁為false
# first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}' # 第一個參數
second_param = "010001" # 第二個參數
# 第三個參數
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# 第四個參數
forth_param = "0CoJUm6Qyw8W8jud"
# 獲取參數
def get_params(page): # page為傳入頁數
iv = "0102030405060708"
first_key = forth_param
second_key = 16 * 'F'
if(page == 1): # 如果為第一頁
first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
h_encText = AES_encrypt(first_param, first_key, iv)
else:
offset = str((page-1)*20)
first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' %(offset,'false')
h_encText = AES_encrypt(first_param, first_key, iv)
h_encText = AES_encrypt(h_encText, second_key, iv)
return h_encText
# 獲取 encSecKey
def get_encSecKey():
encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
return encSecKey
# 解密過程
def AES_encrypt(text, key, iv):
pad = 16 - len(text) % 16
text = text + pad * chr(pad)
encryptor = AES.new(key, AES.MODE_CBC, iv)
encrypt_text = encryptor.encrypt(text)
encrypt_text = base64.b64encode(encrypt_text)
return encrypt_text
# 獲得評論json數據
def get_json(url, params, encSecKey):
data = {
"params": params,
"encSecKey": encSecKey
}
response = requests.post(url, headers=headers, data=data,proxies = proxies)
return response.content
# 抓取熱門評論,返回熱評列表
def get_hot_comments(url):
hot_comments_list = []
hot_comments_list.append(u"用戶ID 用戶昵稱 用戶頭像地址 評論時間 點贊總數 評論內容\n")
params = get_params(1) # 第一頁
encSecKey = get_encSecKey()
json_text = get_json(url,params,encSecKey)
json_dict = json.loads(json_text)
hot_comments = json_dict['hotComments'] # 熱門評論
print("共有%d條熱門評論!" % len(hot_comments))
for item in hot_comments:
comment = item['content'] # 評論內容
likedCount = item['likedCount'] # 點贊總數
comment_time = item['time'] # 評論時間(時間戳)
userID = item['user']['userID'] # 評論者id
nickname = item['user']['nickname'] # 昵稱
avatarUrl = item['user']['avatarUrl'] # 頭像地址
comment_info = userID + " " + nickname + " " + avatarUrl + " " + comment_time + " " + likedCount + " " + comment + u"\n"
hot_comments_list.append(comment_info)
return hot_comments_list
# 抓取某一首歌的全部評論
def get_all_comments(url):
all_comments_list = [] # 存放所有評論
all_comments_list.append(u"用戶ID 用戶昵稱 用戶頭像地址 評論時間 點贊總數 評論內容\n") # 頭部信息
params = get_params(1)
encSecKey = get_encSecKey()
json_text = get_json(url,params,encSecKey)
json_dict = json.loads(json_text)
comments_num = int(json_dict['total'])
if(comments_num % 20 == 0):
page = comments_num / 20
else:
page = int(comments_num / 20) + 1
print("共有%d頁評論!" % page)
for i in range(page): # 逐頁抓取
params = get_params(i+1)
encSecKey = get_encSecKey()
json_text = get_json(url,params,encSecKey)
json_dict = json.loads(json_text)
if i == 0:
print("共有%d條評論!" % comments_num) # 全部評論總數
for item in json_dict['comments']:
comment = item['content'] # 評論內容
likedCount = item['likedCount'] # 點贊總數
comment_time = item['time'] # 評論時間(時間戳)
userID = item['user']['userId'] # 評論者id
nickname = item['user']['nickname'] # 昵稱
avatarUrl = item['user']['avatarUrl'] # 頭像地址
comment_info = unicode(userID) + u" " + nickname + u" " + avatarUrl + u" " + unicode(comment_time) + u" " + unicode(likedCount) + u" " + comment + u"\n"
all_comments_list.append(comment_info)
print("第%d頁抓取完畢!" % (i+1))
return all_comments_list
# 將評論寫入文本文件
def save_to_file(list,filename):
with codecs.open(filename,'a',encoding='utf-8') as f:
f.writelines(list)
print("寫入文件成功!")
if __name__ == "__main__":
start_time = time.time() # 開始時間
url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_453185824?csrf_token="
filename = u"On_My_Way.txt"
all_comments_list = get_all_comments(url)
save_to_file(all_comments_list,filename)
end_time = time.time() #結束時間
print("程序耗時%f秒." % (end_time - start_time))
其中AES需要安裝pycrypto庫,在安裝時報錯,點擊more,可以找到需要安裝的C類庫即可(直接復制相應的網址,下載並安裝即可,好像是VCForPython27.msi)
結果如下:


代碼文件:

******************************♣******************************
抓取微信好友信息
代碼如下:
#!/usr/bin/env python
# encoding=utf-8
from __future__ import print_function
import os
import requests
import re
import time
import xml.dom.minidom
import json
import sys
import math
import subprocess
import ssl
import threading
import urllib, urllib2
DEBUG = False
MAX_GROUP_NUM = 2 # 每組人數
INTERFACE_CALLING_INTERVAL = 5 # 接口調用時間間隔, 間隔太短容易出現"操作太頻繁", 會被限制操作半小時左右
MAX_PROGRESS_LEN = 50
QRImagePath = os.path.join(os.getcwd(), 'qrcode.jpg')
tip = 0
uuid = ''
base_uri = ''
redirect_uri = ''
push_uri = ''
skey = ''
wxsid = ''
wxuin = ''
pass_ticket = ''
deviceId = 'e000000000000000'
BaseRequest = {}
ContactList = []
My = []
SyncKey = []
try:
xrange
range = xrange
except:
# python 3
pass
def responseState(func, BaseResponse):
ErrMsg = BaseResponse['ErrMsg']
Ret = BaseResponse['Ret']
if DEBUG or Ret != 0:
print('func: %s, Ret: %d, ErrMsg: %s' % (func, Ret, ErrMsg))
if Ret != 0:
return False
return True
def getUUID():
global uuid
url = 'https://login.weixin.qq.com/jslogin'
params = {
'appid': 'wx782c26e4c19acffb',
'fun': 'new',
'lang': 'zh_CN',
'_': int(time.time()),
}
r = myRequests.get(url=url, params=params)
r.encoding = 'utf-8'
data = r.text
# print(data)
# window.QRLogin.code = 200; window.QRLogin.uuid = "oZwt_bFfRg==";
regx = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
pm = re.search(regx, data)
code = pm.group(1)
uuid = pm.group(2)
if code == '200':
return True
return False
def showQRImage():
global tip
url = 'https://login.weixin.qq.com/qrcode/' + uuid
params = {
't': 'webwx',
'_': int(time.time()),
}
r = myRequests.get(url=url, params=params)
tip = 1
f = open(QRImagePath, 'wb+')
f.write(r.content)
f.close()
time.sleep(1)
if sys.platform.find('darwin') >= 0:
subprocess.call(['open', QRImagePath])
else:
os.startfile(QRImagePath)
print('請使用微信掃描二維碼以登錄')
def waitForLogin():
global tip, base_uri, redirect_uri, push_uri
url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
tip, uuid, int(time.time()))
r = myRequests.get(url=url)
r.encoding = 'utf-8'
data = r.text
# print(data)
# window.code=500;
regx = r'window.code=(\d+);'
pm = re.search(regx, data)
code = pm.group(1)
if code == '201': # 已掃描
print('成功掃描,請在手機上點擊確認以登錄')
tip = 0
elif code == '200': # 已登錄
print('正在登錄...')
regx = r'window.redirect_uri="(\S+?)";'
pm = re.search(regx, data)
redirect_uri = pm.group(1) + '&fun=new'
base_uri = redirect_uri[:redirect_uri.rfind('/')]
# push_uri與base_uri對應關系(排名分先后)(就是這么奇葩..)
services = [
('wx2.qq.com', 'webpush2.weixin.qq.com'),
('qq.com', 'webpush.weixin.qq.com'),
('web1.wechat.com', 'webpush1.wechat.com'),
('web2.wechat.com', 'webpush2.wechat.com'),
('wechat.com', 'webpush.wechat.com'),
('web1.wechatapp.com', 'webpush1.wechatapp.com'),
]
push_uri = base_uri
for (searchUrl, pushUrl) in services:
if base_uri.find(searchUrl) >= 0:
push_uri = 'https://%s/cgi-bin/mmwebwx-bin' % pushUrl
break
# closeQRImage
if sys.platform.find('darwin') >= 0: # for OSX with Preview
os.system("osascript -e 'quit app \"Preview\"'")
elif code == '408': # 超時
pass
# elif code == '400' or code == '500':
return code
def login():
global skey, wxsid, wxuin, pass_ticket, BaseRequest
r = myRequests.get(url=redirect_uri)
r.encoding = 'utf-8'
data = r.text
# print(data)
doc = xml.dom.minidom.parseString(data)
root = doc.documentElement
for node in root.childNodes:
if node.nodeName == 'skey':
skey = node.childNodes[0].data
elif node.nodeName == 'wxsid':
wxsid = node.childNodes[0].data
elif node.nodeName == 'wxuin':
wxuin = node.childNodes[0].data
elif node.nodeName == 'pass_ticket':
pass_ticket = node.childNodes[0].data
# print('skey: %s, wxsid: %s, wxuin: %s, pass_ticket: %s' % (skey, wxsid,
# wxuin, pass_ticket))
if not all((skey, wxsid, wxuin, pass_ticket)):
return False
BaseRequest = {
'Uin': int(wxuin),
'Sid': wxsid,
'Skey': skey,
'DeviceID': deviceId,
}
return True
def webwxinit():
url = (base_uri +
'/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
pass_ticket, skey, int(time.time())))
params = {'BaseRequest': BaseRequest}
headers = {'content-type': 'application/json; charset=UTF-8'}
r = myRequests.post(url=url, data=json.dumps(params), headers=headers)
r.encoding = 'utf-8'
data = r.json()
if DEBUG:
f = open(os.path.join(os.getcwd(), 'webwxinit.json'), 'wb')
f.write(r.content)
f.close()
# print(data)
global ContactList, My, SyncKey
dic = data
ContactList = dic['ContactList']
My = dic['User']
SyncKey = dic['SyncKey']
state = responseState('webwxinit', dic['BaseResponse'])
return state
def webwxgetcontact():
url = (base_uri +
'/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
pass_ticket, skey, int(time.time())))
headers = {'content-type': 'application/json; charset=UTF-8'}
r = myRequests.post(url=url, headers=headers)
r.encoding = 'utf-8'
data = r.json()
if DEBUG:
f = open(os.path.join(os.getcwd(), 'webwxgetcontact.json'), 'wb')
f.write(r.content)
f.close()
dic = data
MemberList = dic['MemberList']
# 倒序遍歷,不然刪除的時候出問題..
SpecialUsers = ["newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
"floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp", "facebookapp",
"masssendapp",
"meishiapp", "feedsapp", "voip", "blogappweixin", "weixin", "brandsessionholder", "weixinreminder",
"wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts", "notification_messages", "wxitil",
"userexperience_alarm"]
for i in range(len(MemberList) - 1, -1, -1):
Member = MemberList[i]
if Member['VerifyFlag'] & 8 != 0: # 公眾號/服務號
MemberList.remove(Member)
elif Member['UserName'] in SpecialUsers: # 特殊賬號
MemberList.remove(Member)
elif Member['UserName'].find('@@') != -1: # 群聊
MemberList.remove(Member)
elif Member['UserName'] == My['UserName']: # 自己
MemberList.remove(Member)
return MemberList
def syncKey():
SyncKeyItems = ['%s_%s' % (item['Key'], item['Val'])
for item in SyncKey['List']]
SyncKeyStr = '|'.join(SyncKeyItems)
return SyncKeyStr
def syncCheck():
url = push_uri + '/synccheck?'
params = {
'skey': BaseRequest['Skey'],
'sid': BaseRequest['Sid'],
'uin': BaseRequest['Uin'],
'deviceId': BaseRequest['DeviceID'],
'synckey': syncKey(),
'r': int(time.time()),
}
r = myRequests.get(url=url, params=params)
r.encoding = 'utf-8'
data = r.text
# print(data)
# window.synccheck={retcode:"0",selector:"2"}
regx = r'window.synccheck={retcode:"(\d+)",selector:"(\d+)"}'
pm = re.search(regx, data)
retcode = pm.group(1)
selector = pm.group(2)
return selector
def webwxsync():
global SyncKey
url = base_uri + '/webwxsync?lang=zh_CN&skey=%s&sid=%s&pass_ticket=%s' % (
BaseRequest['Skey'], BaseRequest['Sid'], urllib.quote_plus(pass_ticket))
params = {
'BaseRequest': BaseRequest,
'SyncKey': SyncKey,
'rr': ~int(time.time()),
}
headers = {'content-type': 'application/json; charset=UTF-8'}
r = myRequests.post(url=url, data=json.dumps(params))
r.encoding = 'utf-8'
data = r.json()
# print(data)
dic = data
SyncKey = dic['SyncKey']
state = responseState('webwxsync', dic['BaseResponse'])
return state
def heartBeatLoop():
while True:
selector = syncCheck()
if selector != '0':
webwxsync()
time.sleep(1)
def main():
global myRequests
if hasattr(ssl, '_create_unverified_context'):
ssl._create_default_https_context = ssl._create_unverified_context
headers = {
'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'}
myRequests = requests.Session()
myRequests.headers.update(headers)
if not getUUID():
print('獲取uuid失敗')
return
print('正在獲取二維碼圖片...')
showQRImage()
while waitForLogin() != '200':
pass
os.remove(QRImagePath)
if not login():
print('登錄失敗')
return
if not webwxinit():
print('初始化失敗')
return
MemberList = webwxgetcontact()
threading.Thread(target=heartBeatLoop)
MemberCount = len(MemberList)
print('通訊錄共%s位好友' % MemberCount)
d = {}
imageIndex = 0
for Member in MemberList:
imageIndex = imageIndex + 1
# name = 'C:\\Users\\Public\\Pictures\\' + str(imageIndex) + '.jpg'
# imageUrl = 'http://wx2.qq.com' + Member['HeadImgUrl']
# r = myRequests.get(url=imageUrl, headers=headers)
# imageContent = (r.content)
# fileImage = open(name, 'wb')
# fileImage.write(imageContent)
# fileImage.close()
# print('正在下載第:' + str(imageIndex) + '位好友頭像')
d[Member['UserName']] = (Member['NickName'], Member['RemarkName'])
city = Member['City']
city = 'nocity' if city == '' else city
name = Member['NickName']
name = 'noname' if name == '' else name
sign = Member['Signature']
sign = 'nosign' if sign == '' else sign
remark = Member['RemarkName']
remark = 'noremark' if remark == '' else remark
alias = Member['Alias']
alias = 'noalias' if alias == '' else alias
nick = Member['NickName']
nick = 'nonick' if nick == '' else nick
print(name, '|||', city, '|||', Member['Sex'], '|||', Member['StarFriend'], '|||', sign,
'|||', remark, '|||', alias, '|||', nick)
if __name__ == '__main__':
main()
print('回車鍵退出...')
input()
程序運行過程中會跳出二維碼,需要我們掃描登錄
作者原文基於mac,所以我自己修改成了這個樣子(紅色加粗和藍色底紋部分)
subprocess.call(['open', QRImagePath]) 是 mac 下用來打開文件的(open 命令是 macOS 專有的;linux 下應改用 xdg-open)
而windows下要用os.startfile(QRImagePath)(不要問我怎么知道的,我運行報錯后猜出它的作用然后百度的)

(感謝好友的一路陪伴和困厄之時的支持)
微信頭像被保存在對應的文件路徑C:\\Users\\Public\\Pictures\\中
在CSV中:

經過分析(借助了EasyChart的配色):

神奇,居然還是女生多?難道是最近加我的微信或者騙子比較多嘛?
文件
參考:
Python對微信好友進行簡單統計分析當Python遇上微信,可以這么玩
******************************♣******************************
抓取淘寶寶貝信息
代碼如下:
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from taobao.config import *
import pymongo
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
#設置窗口大小,以免默認的相對較小的窗口影響操作
browser.set_window_size(1400, 900)
def search():
print('正在搜索')
try:
browser.get('https://www.taobao.com')
# http://selenium-python.readthedocs.io/waits.html#explicit-waits
# 輸入框
input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
)
#搜索提交按鈕
submit = wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
input.send_keys(KEYWORD)
submit.click()
#等待搜索內容加載,表示為共x頁
total = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
#total中就是總共有多少頁
get_products()
return total.text
except TimeoutException:
# wait在網速過慢的時候會出現超時錯誤,所以我們遞歸調用重新請求
return search()
def next_page(page_number):
#進入下一頁有多種方式,比如點擊xx頁,比如點擊下一頁按鈕
#比如在輸入框中設置頁碼,點擊確定
#我們采用最后一種,因為第一章容易錯亂
#而最后一種相比於第二種方便我們在出錯的時候遞歸
print('正在翻頁', page_number)
try:
#輸入頁碼框
input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
)
#提交按鈕
submit = wait.until(EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
#清楚原先的頁碼
input.clear()
input.send_keys(page_number)
submit.click()
#判斷翻頁是否成功(判斷條件是高亮的當前代碼)
wait.until(EC.text_to_be_present_in_element(
(By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
get_products()
except TimeoutException:
#出錯則遞歸,重新執行
next_page(page_number)
#解析
def get_products():
#判斷所有的寶貝信息是否傳遞成功
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
html = browser.page_source
doc = pq(html)
items = doc('#mainsrp-itemlist .items .item').items()
for item in items:
product = {
'image': item.find('.pic .img').attr('src'),
'price': item.find('.price').text(),
'deal': item.find('.deal-cnt').text()[:-3],
'title': item.find('.title').text(),
'shop': item.find('.shop').text(),
'location': item.find('.location').text()
}
print(product)
save_to_mongo(product)
#保存
def save_to_mongo(result):
try:
if db[MONGO_TABLE].insert(result):
print('存儲到MONGODB成功', result)
except Exception:
print('存儲到MONGODB失敗', result)
def main():
try:
total = search()
#提取總頁數
total = int(re.compile('(\d+)').search(total).group(1))
for i in range(2, total + 1):
#從第二頁才開始需要點擊下一頁
next_page(i)
except Exception:
print('出錯啦')
finally:
browser.close()
if __name__ == '__main__':
main()
注意,這段代碼需要在 Python 3 環境下運行
config文件(上面標紅的config的引入路徑要根據實際情況修改)
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'
# http://phantomjs.org/api/command-line.html
#第一個參數是不加載圖片,使程序運行更快
#第二個的開啟緩存
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = '漢服'
******************************♣******************************
抓取今日頭條
代碼如下:
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from spider_basic.config import *
#
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
#獲取主頁,offse是ajax動態加載的偏移量,keyword是搜索的關鍵字
def get_page_index(offset, keyword):
data = {
'autoload': 'true',
'count': 20,
'cur_tab': 3,
'format': 'json',
'keyword': keyword,
'offset': offset,
}
params = urlencode(data)
base = 'http://www.toutiao.com/search_content/'
url = base + '?' + params
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
return None
except ConnectionError:
print('Error occurred')
return None
#下載
def download_image(url):
print('Downloading', url)
try:
response = requests.get(url)
if response.status_code == 200:
save_image(response.content)
return None
except ConnectionError:
return None
#保存
def save_image(content):
# 文件名
# 路徑 名稱 后綴
# 名稱md5 這是為了當我們運行一次程序出錯時,第二次不再保存重復圖片
file_path = '{0}/{1}/{2}.{3}'.format(os.getcwd(),'pic',md5(content).hexdigest(), 'jpg')
print(file_path)
# 如果文件不存在則保存
# content是二進制形式的網頁內容
if not os.path.exists(file_path):
with open(file_path, 'wb') as f:
f.write(content)
f.close()
#解析搜索后返回的網頁(就是圖集XHR中的那些)
def parse_page_index(text):
try:
#轉化為JSON
data = json.loads(text)
#如果JSON數據中含有data這個鍵
if data and 'data' in data.keys():
for item in data.get('data'):
yield item.get('article_url')
except JSONDecodeError:
pass
#獲取詳情頁
def get_page_detail(url):
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
return None
except ConnectionError:
print('Error occurred')
return None
#解析詳情頁
def parse_page_detail(html, url):
soup = BeautifulSoup(html, 'lxml')
#獲取標題
result = soup.select('title')
title = result[0].get_text() if result else ''
#獲取圖片地址(定義正則規則,並search)
images_pattern = re.compile('var gallery = (.*?);', re.S)
result = re.search(images_pattern, html)
#如果非空
if result:
#轉化為JSON格式
data = json.loads(result.group(1))
#如果sub_images這個鍵的值對應了各種url
if data and 'sub_images' in data.keys():
sub_images = data.get('sub_images')
images = [item.get('url') for item in sub_images]
#下載圖片
for image in images: download_image(image)
return {
'title': title,
'url': url,
'images': images
}
#存儲到數據庫
def save_to_mongo(result):
if db[MONGO_TABLE].insert(result):
print('Successfully Saved to Mongo', result)
return True
return False
def main(offset):
text = get_page_index(offset, KEYWORD)
#生成器返回圖集中每一個圖集的URL的遍歷器
urls = parse_page_index(text)
#遍歷每一個圖集,獲取詳情頁信息
for url in urls:
#獲取詳情頁
html = get_page_detail(url)
#解析詳情頁
result = parse_page_detail(html, url)
#保存到MongoDB
if result: save_to_mongo(result)
# if __name__ == '__main__':
# main(60)
# 注意,如果不在if __name__ == '__main__':中運行的話,會爆出一堆多線程的錯誤
if __name__ == '__main__':
pool = Pool()
groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
pool.map(main, groups)
pool.close()
pool.join()
config文件
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD='萌寵'
附件列表