Python的幾個爬蟲代碼整理(網易雲、微信、淘寶、今日頭條)


整理了一下網易雲歌曲評論抓取、分析好友信息抓取、淘寶寶貝抓取、今日頭條美圖抓取的一些代碼
抓取網易雲評論
進入歌曲界面:
找到如下的數據源:
貼一段Lyrichu的代碼:
(運行環境為 Python 2.7)
# -*- coding: utf-8 -*-
# @Time : 2017/3/28 8:46
# @Author : Lyrichu
# @Email : 919987476@qq.com
# @File : NetCloud_spider3.py
'''
@Description:
網易雲音樂評論爬蟲,可以完整爬取整個評論
部分參考了@平胸小仙女的文章(地址:https://www.zhihu.com/question/36081767)
post加密部分也給出了,可以參考原帖:
作者:平胸小仙女
鏈接:https://www.zhihu.com/question/36081767/answer/140287795
來源:知乎
'''
from Crypto.Cipher import AES
import base64
import requests
import json
import codecs
import time

# Request headers. The Cookie is a captured logged-in browser session;
# music.163.com's weapi endpoint rejects anonymous POSTs.
headers = {
    'Host': "music.163.com",
    'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    'Accept-Encoding': "gzip, deflate",
    'Content-Type': "application/x-www-form-urlencoded",
    'Cookie': "_ntes_nnid=754361b04b121e078dee797cdb30e0fd,1486026808627; _ntes_nuid=754361b04b121e078dee797cdb30e0fd; JSESSIONID-WYYY=yfqt9ofhY%5CIYNkXW71TqY5OtSZyjE%2FoswGgtl4dMv3Oa7%5CQ50T%2FVaee%2FMSsCifHE0TGtRMYhSPpr20i%5CRO%2BO%2B9pbbJnrUvGzkibhNqw3Tlgn%5Coil%2FrW7zFZZWSA3K9gD77MPSVH6fnv5hIT8ms70MNB3CxK5r3ecj3tFMlWFbFOZmGw%5C%3A1490677541180; _iuqxldmzr_=32; vjuids=c8ca7976.15a029d006a.0.51373751e63af8; vjlast=1486102528.1490172479.21; __gads=ID=a9eed5e3cae4d252:T=1486102537:S=ALNI_Mb5XX2vlkjsiU5cIy91-ToUDoFxIw; vinfo_n_f_l_n3=411a2def7f75a62e.1.1.1486349441669.1486349607905.1490173828142; P_INFO=m15527594439@163.com|1489375076|1|study|00&99|null&null&null#hub&420100#10#0#0|155439&1|study_client|15527594439@163.com; NTES_CMT_USER_INFO=84794134%7Cm155****4439%7Chttps%3A%2F%2Fsimg.ws.126.net%2Fe%2Fimg5.cache.netease.com%2Ftie%2Fimages%2Fyun%2Fphoto_default_62.png.39x39.100.jpg%7Cfalse%7CbTE1NTI3NTk0NDM5QDE2My5jb20%3D; usertrack=c+5+hljHgU0T1FDmA66MAg==; Province=027; City=027; _ga=GA1.2.1549851014.1489469781; __utma=94650624.1549851014.1489469781.1490664577.1490672820.8; __utmc=94650624; __utmz=94650624.1490661822.6.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; playerid=81568911; __utmb=94650624.23.10.1490672820",
    'Connection': "keep-alive",
    'Referer': 'http://music.163.com/'
}

# Proxy servers.
# BUG FIX: the original keys were 'http:' / 'https:' (with a trailing colon);
# requests matches proxies by URL scheme, so both entries were silently ignored.
proxies = {
    'http': 'http://121.232.146.184',
    'https': 'https://144.255.48.197'
}

# offset = (page_number - 1) * 20; `total` is "true" for the first page only.
# first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'  # first weapi parameter
second_param = "010001"  # presumably the RSA public exponent (hex) — TODO confirm
# Long hex constant used by the web client's encryption scheme
# (presumably the RSA modulus — confirm against the weapi JS).
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# Fixed AES key baked into the web client.
forth_param = "0CoJUm6Qyw8W8jud"

# Build the encrypted "params" POST field.
def get_params(page):  # page is the 1-based comment page number
    """Return the doubly AES-encrypted `params` value for comment page *page*.

    The plaintext JSON is encrypted first with the fixed client key
    (forth_param) and then with the 16*'F' key that matches the hard-coded
    encSecKey returned by get_encSecKey().
    """
    iv = "0102030405060708"
    first_key = forth_param
    second_key = 16 * 'F'
    if page == 1:  # first page: offset 0, total "true"
        first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
    else:
        offset = str((page - 1) * 20)  # 20 comments per page
        first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
    # Both branches encrypt the same way: fixed key first, then second key.
    h_encText = AES_encrypt(first_param, first_key, iv)
    h_encText = AES_encrypt(h_encText, second_key, iv)
    return h_encText

# Return the fixed encSecKey.
def get_encSecKey():
    """Return the hard-coded encSecKey matching the fixed 16*'F' second key.

    Because the "random" AES key in get_params() is constant, its RSA
    encryption (this value) can be precomputed once and reused.
    """
    encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
    return encSecKey


# AES encryption step (the original comment said "decrypt", but this encrypts).
def AES_encrypt(text, key, iv):
    """AES-CBC encrypt *text* and return the base64-encoded ciphertext.

    Python 2 semantics: str arguments act as bytes. Padding is PKCS#7.
    """
    # PKCS#7: pad up to a 16-byte boundary with chr(pad_length).
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    encryptor = AES.new(key, AES.MODE_CBC, iv)
    encrypt_text = encryptor.encrypt(text)
    encrypt_text = base64.b64encode(encrypt_text)
    return encrypt_text

# POST the encrypted form and return the raw comment JSON.
def get_json(url, params, encSecKey):
    """POST *params*/*encSecKey* to the weapi *url*; return the response body bytes."""
    data = {
        "params": params,
        "encSecKey": encSecKey
    }
    response = requests.post(url, headers=headers, data=data, proxies=proxies)
    return response.content

# Fetch the hot comments; returns a list of formatted text lines.
def get_hot_comments(url):
    """Fetch page 1 of *url* and return the hot-comment list as formatted lines."""
    hot_comments_list = []
    hot_comments_list.append(u"用戶ID 用戶昵稱 用戶頭像地址 評論時間 點贊總數 評論內容\n")
    params = get_params(1)  # hot comments only come with the first page
    encSecKey = get_encSecKey()
    json_text = get_json(url, params, encSecKey)
    json_dict = json.loads(json_text)
    hot_comments = json_dict['hotComments']
    print("共有%d條熱門評論!" % len(hot_comments))
    for item in hot_comments:
        comment = item['content']
        likedCount = item['likedCount']  # like count (int)
        comment_time = item['time']  # timestamp
        # BUG FIX: the API field is 'userId' (as used in get_all_comments),
        # not 'userID'; the old key raised KeyError.
        userID = item['user']['userId']
        nickname = item['user']['nickname']
        avatarUrl = item['user']['avatarUrl']
        # BUG FIX: wrap the numeric fields in unicode() — concatenating ints
        # to unicode raised TypeError (get_all_comments already did this).
        comment_info = (unicode(userID) + u" " + nickname + u" " + avatarUrl + u" " +
                        unicode(comment_time) + u" " + unicode(likedCount) + u" " + comment + u"\n")
        hot_comments_list.append(comment_info)
    return hot_comments_list

# Fetch every comment of one song.
def get_all_comments(url):
    """Fetch all comment pages for the song at *url*; return formatted lines."""
    all_comments_list = []  # all comments accumulate here
    all_comments_list.append(u"用戶ID 用戶昵稱 用戶頭像地址 評論時間 點贊總數 評論內容\n")  # header line
    params = get_params(1)
    encSecKey = get_encSecKey()
    json_text = get_json(url, params, encSecKey)
    json_dict = json.loads(json_text)
    comments_num = int(json_dict['total'])
    # 20 comments per page; round up to get the page count.
    if comments_num % 20 == 0:
        page = comments_num // 20
    else:
        page = comments_num // 20 + 1
    print("共有%d頁評論!" % page)
    for i in range(page):  # scrape page by page
        params = get_params(i + 1)
        encSecKey = get_encSecKey()
        json_text = get_json(url, params, encSecKey)
        json_dict = json.loads(json_text)
        if i == 0:
            print("共有%d條評論!" % comments_num)  # grand total
        for item in json_dict['comments']:
            comment = item['content']
            likedCount = item['likedCount']  # like count
            comment_time = item['time']  # timestamp
            userID = item['user']['userId']
            nickname = item['user']['nickname']
            avatarUrl = item['user']['avatarUrl']
            comment_info = (unicode(userID) + u" " + nickname + u" " + avatarUrl + u" " +
                            unicode(comment_time) + u" " + unicode(likedCount) + u" " + comment + u"\n")
            all_comments_list.append(comment_info)
        print("%d頁抓取完畢!" % (i + 1))
    return all_comments_list


# Append the comment lines to a text file.
def save_to_file(list, filename):
    """Append the unicode lines in *list* to *filename* (UTF-8, append mode).

    NOTE(review): the parameter name `list` shadows the builtin; kept
    unchanged so existing keyword callers do not break.
    """
    with codecs.open(filename, 'a', encoding='utf-8') as f:
        f.writelines(list)
    print("寫入文件成功!")

if __name__ == "__main__":
start_time = time.time() # 開始時間
url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_453185824?csrf_token="
filename = u"On_My_Way.txt"
all_comments_list = get_all_comments(url)
save_to_file(all_comments_list,filename)
end_time = time.time() #結束時間
print("程序耗時%f." % (end_time - start_time))
其中AES需要安裝pycrypto庫,在安裝時報錯,點擊more,可以找到需要安裝的C類庫即可(直接復制相應的網址,下載並安裝即可,好像是VCForPython27.msi)
結果如下:
代碼文件:


參考:


******************************♣******************************
抓取微信好友信息
代碼如下:
#!/usr/bin/env python
# encoding=utf-8
from __future__ import print_function

import os
import requests
import re
import time
import xml.dom.minidom
import json
import sys
import math
import subprocess
import ssl
import threading
import urllib, urllib2

DEBUG = False

MAX_GROUP_NUM = 2  # members per group
INTERFACE_CALLING_INTERVAL = 5  # seconds between API calls; shorter intervals trigger throttling for ~30 min
MAX_PROGRESS_LEN = 50

# Where the login QR code image is written.
QRImagePath = os.path.join(os.getcwd(), 'qrcode.jpg')

tip = 0
uuid = ''

base_uri = ''
redirect_uri = ''
push_uri = ''

# Session credentials filled in by login().
skey = ''
wxsid = ''
wxuin = ''
pass_ticket = ''
deviceId = 'e000000000000000'

BaseRequest = {}

ContactList = []
My = []
SyncKey = []

# On Python 2, alias range to the lazy xrange; on Python 3 range is already lazy.
try:
    range = xrange
except NameError:
    # Python 3: xrange does not exist — keep the builtin range.
    pass


def responseState(func, BaseResponse):
    """Check a WeChat BaseResponse dict; return True iff Ret == 0 (success).

    Logs the result when DEBUG is on or when the call failed.
    """
    ErrMsg = BaseResponse['ErrMsg']
    Ret = BaseResponse['Ret']
    if DEBUG or Ret != 0:
        print('func: %s, Ret: %d, ErrMsg: %s' % (func, Ret, ErrMsg))

    if Ret != 0:
        return False

    return True


def getUUID():
    """Request a login UUID from the jslogin endpoint; True when code == 200."""
    global uuid

    url = 'https://login.weixin.qq.com/jslogin'
    params = {
        'appid': 'wx782c26e4c19acffb',  # web WeChat application id
        'fun': 'new',
        'lang': 'zh_CN',
        '_': int(time.time()),  # cache buster
    }

    r = myRequests.get(url=url, params=params)
    r.encoding = 'utf-8'
    data = r.text

    # Response looks like:
    # window.QRLogin.code = 200; window.QRLogin.uuid = "oZwt_bFfRg==";
    regx = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
    pm = re.search(regx, data)

    code = pm.group(1)
    uuid = pm.group(2)

    if code == '200':
        return True

    return False


def showQRImage():
    """Download the login QR code to QRImagePath and open it with the OS viewer."""
    global tip

    url = 'https://login.weixin.qq.com/qrcode/' + uuid
    params = {
        't': 'webwx',
        '_': int(time.time()),
    }

    r = myRequests.get(url=url, params=params)
    tip = 1

    # `with` guarantees the file handle is closed even on error.
    with open(QRImagePath, 'wb+') as f:
        f.write(r.content)
    time.sleep(1)

    if sys.platform.find('darwin') >= 0:
        subprocess.call(['open', QRImagePath])  # macOS: open with Preview
    else:
        os.startfile(QRImagePath)  # Windows: open with the default viewer

    print('請使用微信掃描二維碼以登錄')


def waitForLogin():
    """Poll the login endpoint once and return the status code string.

    '201' = QR scanned, '200' = login confirmed (also resolves redirect_uri,
    base_uri and push_uri), '408' = poll timed out (caller should retry).
    """
    global tip, base_uri, redirect_uri, push_uri

    url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
        tip, uuid, int(time.time()))

    r = myRequests.get(url=url)
    r.encoding = 'utf-8'
    data = r.text

    # Response looks like: window.code=500;
    regx = r'window.code=(\d+);'
    pm = re.search(regx, data)

    code = pm.group(1)

    if code == '201':  # QR code scanned
        print('成功掃描,請在手機上點擊確認以登錄')
        tip = 0
    elif code == '200':  # confirmed on the phone
        print('正在登錄...')
        regx = r'window.redirect_uri="(\S+?)";'
        pm = re.search(regx, data)
        redirect_uri = pm.group(1) + '&fun=new'
        base_uri = redirect_uri[:redirect_uri.rfind('/')]

        # Map the login host to its push (synccheck) host; the list order matters.
        services = [
            ('wx2.qq.com', 'webpush2.weixin.qq.com'),
            ('qq.com', 'webpush.weixin.qq.com'),
            ('web1.wechat.com', 'webpush1.wechat.com'),
            ('web2.wechat.com', 'webpush2.wechat.com'),
            ('wechat.com', 'webpush.wechat.com'),
            ('web1.wechatapp.com', 'webpush1.wechatapp.com'),
        ]
        push_uri = base_uri
        for (searchUrl, pushUrl) in services:
            if base_uri.find(searchUrl) >= 0:
                push_uri = 'https://%s/cgi-bin/mmwebwx-bin' % pushUrl
                break

        # Close the QR code viewer (macOS Preview).
        if sys.platform.find('darwin') >= 0:
            os.system("osascript -e 'quit app \"Preview\"'")
    elif code == '408':  # poll timeout — not an error, caller retries
        pass
    # other codes ('400', '500') fall through unchanged

    return code


def login():
    """Follow redirect_uri, parse the XML credentials, and build BaseRequest.

    Returns False when any of skey/wxsid/wxuin/pass_ticket is missing.
    """
    global skey, wxsid, wxuin, pass_ticket, BaseRequest

    r = myRequests.get(url=redirect_uri)
    r.encoding = 'utf-8'
    data = r.text

    doc = xml.dom.minidom.parseString(data)
    root = doc.documentElement

    # Pull the four credential fields out of the flat XML document.
    for node in root.childNodes:
        if node.nodeName == 'skey':
            skey = node.childNodes[0].data
        elif node.nodeName == 'wxsid':
            wxsid = node.childNodes[0].data
        elif node.nodeName == 'wxuin':
            wxuin = node.childNodes[0].data
        elif node.nodeName == 'pass_ticket':
            pass_ticket = node.childNodes[0].data

    if not all((skey, wxsid, wxuin, pass_ticket)):
        return False

    BaseRequest = {
        'Uin': int(wxuin),
        'Sid': wxsid,
        'Skey': skey,
        'DeviceID': deviceId,
    }

    return True


def webwxinit():
    """Initialise the web session; fill the ContactList/My/SyncKey globals.

    Returns True when the server reports success (BaseResponse.Ret == 0).
    """
    url = (base_uri +
           '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
               pass_ticket, skey, int(time.time())))
    params = {'BaseRequest': BaseRequest}
    headers = {'content-type': 'application/json; charset=UTF-8'}

    r = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    r.encoding = 'utf-8'
    data = r.json()

    if DEBUG:
        # Dump the raw response for offline inspection.
        with open(os.path.join(os.getcwd(), 'webwxinit.json'), 'wb') as f:
            f.write(r.content)

    global ContactList, My, SyncKey
    dic = data
    ContactList = dic['ContactList']
    My = dic['User']
    SyncKey = dic['SyncKey']

    state = responseState('webwxinit', dic['BaseResponse'])
    return state


def webwxgetcontact():
    """Fetch the full contact list, filtered down to real friends.

    Drops official/service accounts (VerifyFlag & 8), built-in special
    accounts, group chats ('@@' in UserName) and the logged-in user.
    """
    url = (base_uri +
           '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
               pass_ticket, skey, int(time.time())))
    headers = {'content-type': 'application/json; charset=UTF-8'}

    r = myRequests.post(url=url, headers=headers)
    r.encoding = 'utf-8'
    data = r.json()

    if DEBUG:
        # Dump the raw response for offline inspection.
        with open(os.path.join(os.getcwd(), 'webwxgetcontact.json'), 'wb') as f:
            f.write(r.content)

    dic = data
    MemberList = dic['MemberList']

    # Built-in system accounts that are not real friends.
    SpecialUsers = ["newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
                    "floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp", "facebookapp",
                    "masssendapp",
                    "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin", "brandsessionholder", "weixinreminder",
                    "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts", "notification_messages", "wxitil",
                    "userexperience_alarm"]
    # Iterate backwards so removing entries does not skip the next element.
    for i in range(len(MemberList) - 1, -1, -1):
        Member = MemberList[i]
        if Member['VerifyFlag'] & 8 != 0:  # official / service account
            MemberList.remove(Member)
        elif Member['UserName'] in SpecialUsers:  # special system account
            MemberList.remove(Member)
        elif Member['UserName'].find('@@') != -1:  # group chat
            MemberList.remove(Member)
        elif Member['UserName'] == My['UserName']:  # the logged-in user
            MemberList.remove(Member)

    return MemberList


def syncKey():
    """Serialise the SyncKey global as 'key_val|key_val|...' for synccheck."""
    SyncKeyItems = ['%s_%s' % (item['Key'], item['Val'])
                    for item in SyncKey['List']]
    SyncKeyStr = '|'.join(SyncKeyItems)
    return SyncKeyStr


def syncCheck():
    """Long-poll the synccheck endpoint; return the selector string ('0' = idle)."""
    url = push_uri + '/synccheck?'
    params = {
        'skey': BaseRequest['Skey'],
        'sid': BaseRequest['Sid'],
        'uin': BaseRequest['Uin'],
        'deviceId': BaseRequest['DeviceID'],
        'synckey': syncKey(),
        'r': int(time.time()),
    }

    r = myRequests.get(url=url, params=params)
    r.encoding = 'utf-8'
    data = r.text

    # Response looks like: window.synccheck={retcode:"0",selector:"2"}
    regx = r'window.synccheck={retcode:"(\d+)",selector:"(\d+)"}'
    pm = re.search(regx, data)

    retcode = pm.group(1)  # NOTE(review): retcode is parsed but never checked
    selector = pm.group(2)

    return selector


def webwxsync():
    """Pull new data from webwxsync and refresh the SyncKey global.

    Returns True when the server reports success.
    """
    global SyncKey

    url = base_uri + '/webwxsync?lang=zh_CN&skey=%s&sid=%s&pass_ticket=%s' % (
        BaseRequest['Skey'], BaseRequest['Sid'], urllib.quote_plus(pass_ticket))
    params = {
        'BaseRequest': BaseRequest,
        'SyncKey': SyncKey,
        'rr': ~int(time.time()),  # anti-cache value used by the web client
    }
    headers = {'content-type': 'application/json; charset=UTF-8'}

    # BUG FIX: the original built `headers` but never passed it to post().
    r = myRequests.post(url=url, data=json.dumps(params), headers=headers)
    r.encoding = 'utf-8'
    data = r.json()

    dic = data
    SyncKey = dic['SyncKey']

    state = responseState('webwxsync', dic['BaseResponse'])
    return state


def heartBeatLoop():
    """Poll synccheck forever; run webwxsync whenever there is new data."""
    while True:
        selector = syncCheck()
        if selector != '0':  # '0' means nothing new
            webwxsync()
        time.sleep(1)


def main():
    """Log in via QR code, fetch the contact list and print friend info."""
    global myRequests

    # Older Python builds reject WeChat's certificate chain; skip verification.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context

    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36'}
    myRequests = requests.Session()
    myRequests.headers.update(headers)

    if not getUUID():
        print('獲取uuid失敗')
        return

    print('正在獲取二維碼圖片...')
    showQRImage()

    # Poll until the phone confirms the login.
    while waitForLogin() != '200':
        pass

    os.remove(QRImagePath)

    if not login():
        print('登錄失敗')
        return

    if not webwxinit():
        print('初始化失敗')
        return

    MemberList = webwxgetcontact()

    # BUG FIX: the thread was created but never started; run it as a daemon
    # so it does not keep the process alive after main() returns.
    heartbeat = threading.Thread(target=heartBeatLoop)
    heartbeat.daemon = True
    heartbeat.start()

    MemberCount = len(MemberList)
    print('通訊錄共%s位好友' % MemberCount)

    d = {}
    imageIndex = 0
    for Member in MemberList:
        imageIndex = imageIndex + 1
        # Avatar download kept for reference (Windows path):
        # name = 'C:\\Users\\Public\\Pictures\\' + str(imageIndex) + '.jpg'
        # imageUrl = 'http://wx2.qq.com' + Member['HeadImgUrl']
        # r = myRequests.get(url=imageUrl, headers=headers)
        # imageContent = (r.content)
        # fileImage = open(name, 'wb')
        # fileImage.write(imageContent)
        # fileImage.close()
        # print('正在下載第:' + str(imageIndex) + '位好友頭像')
        d[Member['UserName']] = (Member['NickName'], Member['RemarkName'])
        # Substitute placeholders for empty profile fields.
        city = Member['City']
        city = 'nocity' if city == '' else city
        name = Member['NickName']
        name = 'noname' if name == '' else name
        sign = Member['Signature']
        sign = 'nosign' if sign == '' else sign
        remark = Member['RemarkName']
        remark = 'noremark' if remark == '' else remark
        alias = Member['Alias']
        alias = 'noalias' if alias == '' else alias
        nick = Member['NickName']
        nick = 'nonick' if nick == '' else nick
        print(name, '|||', city, '|||', Member['Sex'], '|||', Member['StarFriend'], '|||', sign,
              '|||', remark, '|||', alias, '|||', nick)



if __name__ == '__main__':
    main()
    print('回車鍵退出...')
    # NOTE(review): on Python 2 input() evaluates the typed text;
    # raw_input would be safer there.
    input()
程序運行過程中會跳出二維碼,需要我們掃描登錄
作者原文基於mac,所以我自己修改成了這個樣子(紅色加粗和藍色底紋部分)
subprocess.call(['open', QRImagePath]) 是給linux或mac下用來打開文件的
而windows下要用os.startfile(QRImagePath)(不要問我怎么知道的,我運行報錯后猜出它的作用然后百度的)
(感謝好友的一路陪伴和困厄之時的支持)
微信頭像被保存在對應的文件路徑C:\\Users\\Public\\Pictures\\中
在CSV中:
經過分析(借助了EasyChart的配色):
   
神奇,居然還是女生多?難道是最近加我的微信或者騙子比較多嘛?

文件
參考:
Python對微信好友進行簡單統計分析
當Python遇上微信,可以這么玩

******************************♣******************************
抓取淘寶寶貝信息
代碼如下:
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from taobao.config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
# Enlarge the window: PhantomJS's small default viewport can break element interaction.
browser.set_window_size(1400, 900)

def search():
    """Open taobao.com, search for KEYWORD and scrape page 1.

    Returns the text of the pager's "total" element (e.g. "共 100 頁").
    On timeout the whole search is retried by recursion.
    """
    print('正在搜索')
    try:
        browser.get('https://www.taobao.com')
        # Explicit waits: http://selenium-python.readthedocs.io/waits.html#explicit-waits
        # Search input box (renamed from `input`, which shadowed the builtin).
        search_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        # Search submit button.
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))

        search_box.send_keys(KEYWORD)
        submit.click()

        # Wait for the results pager; its "total" element carries the page count.
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        # Slow networks trigger timeouts; retry the request recursively.
        return search()


def next_page(page_number):
    """Jump to *page_number* via the pager's page-number input and scrape it.

    Typing the page number (instead of clicking "next") keeps retries
    idempotent: on timeout we can simply recurse with the same number.
    """
    print('正在翻頁', page_number)
    try:
        # Page-number input box (renamed from `input`, which shadowed the builtin).
        page_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        # Confirm button.
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        # Clear the previous page number before typing the new one.
        page_box.clear()
        page_box.send_keys(page_number)
        submit.click()
        # Success check: the highlighted current-page marker shows the new number.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        get_products()
    except TimeoutException:
        # Retry the same page on timeout.
        next_page(page_number)

# Parse the current result page.
def get_products():
    """Extract every item on the current result page and save it to MongoDB."""
    # Wait until the item list has rendered.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # [:-3] strips the last three characters of the deal-count text
            # (presumably the "人付款" suffix — confirm against live markup).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)

# Persist one product record.
def save_to_mongo(result):
    """Insert *result* into MONGO_TABLE; log success or failure."""
    try:
        # NOTE(review): Collection.insert() is deprecated in modern pymongo
        # (insert_one is the replacement); kept as-is for the old driver.
        if db[MONGO_TABLE].insert(result):
            print('存儲到MONGODB成功', result)
    except Exception:
        print('存儲到MONGODB失敗', result)


def main():
    """Scrape every result page for KEYWORD, then shut the browser down."""
    try:
        total = search()
        # Extract the page count from text like "共 100 頁".
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            # Page 1 was already scraped inside search(); paginate from page 2.
            next_page(i)
    except Exception:
        print('出錯啦')
    finally:
        browser.close()

if __name__ == '__main__':
    main()
注意,是在Python3下
config文件(上面標紅的config的引入路徑要根據實際情況修改)
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'

# PhantomJS flags: http://phantomjs.org/api/command-line.html
# --load-images=false: skip image loading for speed.
# --disk-cache=true: enable the on-disk cache.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

KEYWORD = '漢服'

******************************♣******************************
抓取今日頭條
代碼如下:

import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from spider_basic.config import *

# MongoDB handle. connect=False defers the actual connection until first use,
# which keeps the client fork-safe for the multiprocessing.Pool workers below.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


# Fetch one page of the search feed; offset is the ajax paging offset,
# keyword is the search term.
def get_page_index(offset, keyword):
    """GET the toutiao search_content feed.

    Returns the response body text, or None on a non-200 status or a
    connection error.
    """
    data = {
        'autoload': 'true',
        'count': 20,  # items per page
        'cur_tab': 3,  # tab index in the search UI (presumably galleries — confirm)
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None

# Download one image.
def download_image(url):
    """Download *url* and hand the bytes to save_image; errors are swallowed."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None

# Persist image bytes to disk.
def save_image(content):
    """Write *content* to ./pic/<md5>.jpg.

    Naming by content MD5 deduplicates images when the program is re-run
    after a partial failure.
    """
    pic_dir = os.path.join(os.getcwd(), 'pic')
    # BUG FIX: the original crashed with IOError when ./pic did not exist.
    if not os.path.isdir(pic_dir):
        os.makedirs(pic_dir)
    file_path = '{0}/{1}.{2}'.format(pic_dir, md5(content).hexdigest(), 'jpg')
    print(file_path)
    # Skip writing if an identical image was already saved.
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

# Parse the search-feed JSON (the gallery XHR responses).
def parse_page_index(text):
    """Yield each gallery's `article_url` from the feed JSON in *text*.

    Yields nothing when *text* is not valid JSON or has no 'data' key.
    """
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass

# Fetch a gallery detail page.
def get_page_detail(url):
    """GET *url*; return the HTML text, or None on non-200 / connection error."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None

# Parse a gallery detail page.
def parse_page_detail(html, url):
    """Extract title and image URLs from a gallery page and download each image.

    Returns {'title', 'url', 'images'} on success; returns None (implicitly)
    when the page carries no gallery JSON.
    """
    soup = BeautifulSoup(html, 'lxml')
    # Page title from the <title> tag.
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # The image list is embedded as a JS assignment: var gallery = {...};
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            # Download every image as a side effect.
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }

# Insert a parsed gallery record into MongoDB.
def save_to_mongo(result):
    """Insert *result* into MONGO_TABLE; return True on success."""
    if db[MONGO_TABLE].insert(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process one feed page at *offset*: fetch the feed, then every gallery."""
    text = get_page_index(offset, KEYWORD)
    # Generator over the article URL of each gallery in the feed.
    urls = parse_page_index(text)
    for url in urls:
        # Fetch and parse the gallery detail page.
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        # Persist to MongoDB when the page actually contained a gallery.
        if result:
            save_to_mongo(result)


# Single-process variant kept for reference:
# if __name__ == '__main__':
#     main(60)

# Pool.map must run under the __main__ guard, otherwise the worker
# processes re-import the module and raise multiprocessing errors.
if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
config文件
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Feed offsets scraped: 20*GROUP_START .. 20*GROUP_END, one per pool task.
GROUP_START = 1
GROUP_END = 20
KEYWORD='萌寵'






附件列表

     


    免責聲明!

    本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



     
    粵ICP備18138465號   © 2018-2025 CODEPRJ.COM