Because I'm building 去轉盤網, I need to crawl netdisk (cloud drive) share resources. Writing such a crawler yourself is not easy, and at first I didn't want to share it, but in the end I decided to put it out for everyone; after all, progress comes from exchange. If you're interested, have a look at my other posts or follow me, and you'll find that most of the technology behind 去轉盤網 is effectively public by now. If it helps you, read it carefully. The crawler code is below.
PS: if you don't know Python, go learn it first; the code is written in Python.
I also run a magnet-link search site. I'm not ready to publish that code yet, though I plan to eventually; check it out if you like: ok搜搜.
Update, October 2019:
The self-built crawler approach below has largely stopped working, and the resource quality you get from a Google Custom Search setup isn't great either. At the moment the simplest and least troublesome option is to call an existing API directly.
Recommended platform:
https://www.xiaocongjisuan.com/
Netdisk API:
https://www.xiaocongjisuan.com/show/api/2
#coding: utf8
"""
author:haoning
create time: 2015-8-15
"""
import re        # regular expressions
import urllib2   # URL-fetching module (imported in the original; not actually used below)
import time
from Queue import Queue
import threading, errno, datetime
import json
import requests #Requests is an Apache2 Licensed HTTP library
import MySQLdb as mdb
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = ''
# Regex patterns
re_start = re.compile(r'start=(\d+)')   # \d matches a digit 0-9; + means one or more digits, e.g. 21312314
re_uid = re.compile(r'query_uk=(\d+)')  # user id being queried (uk)
re_urlid = re.compile(r'&urlid=(\d+)')  # url job id
ONEPAGE = 20       # items per page for follow/fans lists
ONESHAREPAGE = 20  # share links per page
# Album lists are not crawled yet
URL_SHARE = 'http://yun.baidu.com/pcloud/feed/getsharelist?auth_type=1&start={start}&limit=20&query_uk={uk}&urlid={id}' # public share list
"""
{"feed_type":"share","category":6,"public":"1","shareid":"1541924625","data_id":"2418757107690953697","title":"\u5723\u8bde\u58c1\u7eb8\u5927\u6d3e\u9001","third":0,"clienttype":0,"filecount":1,"uk":1798788396,"username":"SONYcity03","feed_time":1418986714000,"desc":"","avatar_url":"http:\/\/himg.bdimg.com\/sys\/portrait\/item\/1b6bf333.jpg","dir_cnt":1,"filelist":[{"server_filename":"\u5723\u8bde\u58c1\u7eb8\u5927\u6d3e\u9001","category":6,"isdir":1,"size":1024,"fs_id":870907642649299,"path":"%2F%E5%9C%A3%E8%AF%9E%E5%A3%81%E7%BA%B8%E5%A4%A7%E6%B4%BE%E9%80%81","md5":"0","sign":"1221d7d56438970225926ad552423ff6a5d3dd33","time_stamp":1439542024}],"source_uid":"871590683","source_id":"1541924625","shorturl":"1dDndV6T","vCnt":34296,"dCnt":7527,"tCnt":5056,"like_status":0,"like_count":60,"comment_count":19},
public: whether the share is public
title:  file name
uk:     user id
"""
URL_FOLLOW = 'http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk={uk}&limit=20&start={start}&urlid={id}' # list of users this user follows
"""
{"type":-1,"follow_uname":"\u597d\u55e8\u597d\u55e8\u554a","avatar_url":"http:\/\/himg.bdimg.com\/sys\/portrait\/item\/979b832f.jpg","intro":"\u9700\u8981\u597d\u8d44\u6599\u52a0994798392","user_type":0,"is_vip":0,"follow_count":2,"fans_count":2276,"follow_time":1415614418,"pubshare_count":36,"follow_uk":2603342172,"album_count":0},
follow_uname: name of the followed user
fans_count:   number of fans
"""
URL_FANS = 'http://yun.baidu.com/pcloud/friend/getfanslist?query_uk={uk}&limit=20&start={start}&urlid={id}' # list of this user's fans
"""
{"type":-1,"fans_uname":"\u62e8\u52a8\u795e\u7684\u5fc3\u7eea","avatar_url":"http:\/\/himg.bdimg.com\/sys\/portrait\/item\/d5119a2b.jpg","intro":"","user_type":0,"is_vip":0,"follow_count":8,"fans_count":39,"follow_time":1439541512,"pubshare_count":15,"fans_uk":288332613,"album_count":0}
avatar_url: avatar image
fans_uname: user name
"""
QNUM = 1000
hc_q = Queue(20)    # request queue
hc_r = Queue(QNUM)  # response queue
success = 0
failed = 0
def req_worker(inx):  # request thread
    s = requests.Session()  # one session per thread
    while True:
        req_item = hc_q.get()   # next request item
        req_type = req_item[0]  # request type: share / follow / fans
        url = req_item[1]       # url to fetch
        r = s.get(url)          # fetch the data
        hc_r.put((r.text, url)) # push the response text and its url onto the response queue
        print "req_worker#", inx, url  # inx: thread index; url: the url just fetched
def response_worker():  # consume responses and store the results
    dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
    dbcurr = dbconn.cursor()
    dbcurr.execute('SET NAMES utf8')
    dbcurr.execute('set global wait_timeout=60000')  # database setup
    while True:
        """
        regex notes:
        match()    checks whether the RE matches at the start of the string
        search()   scans the string for a position where the RE matches
        findall()  returns all substrings matched by the RE as a list
        finditer() returns all substrings matched by the RE as an iterator
        Baidu share link example: http://pan.baidu.com/share/link?shareid=3685432306&uk=1798788396&from=hotrec
        uk is in fact the user id
        """
        metadata, effective_url = hc_r.get()  # metadata is the r.text pushed by req_worker
        #print "response_worker:", effective_url
        try:
            tnow = int(time.time())  # current time, used as lastaccess
            id = re_urlid.findall(effective_url)[0]     # url job id
            start = re_start.findall(effective_url)[0]  # paging offset
            if True:
                if 'getfollowlist' in effective_url:  # type = 1: follow list
                    follows = json.loads(metadata)  # parse the response body as JSON
                    uid = re_uid.findall(effective_url)[0]  # user id being queried
                    if "total_count" in follows.keys() and follows["total_count"] > 0 and str(start) == "0":
                        # first page: schedule one job per remaining page
                        for i in range((follows["total_count"] - 1) / ONEPAGE):
                            try:
                                dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 1, 0)' % (uid, str(ONEPAGE * (i + 1)), str(ONEPAGE)))
                                # store a url job: start is the paging offset, status=0 means not yet processed
                            except Exception as ex:
                                print "E1", str(ex)
                                pass
                    if "follow_list" in follows.keys():  # the followed users themselves
                        for item in follows["follow_list"]:
                            try:
                                dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)' % (item['follow_uk'], item['follow_uname'], str(tnow)))
                                # store the followed user's id, name and insertion time
                            except Exception as ex:
                                print "E13", str(ex)
                                pass
                    else:
                        print "delete 1", uid, start
                        dbcurr.execute('delete from urlids where uk=%s and type=1 and start>%s' % (uid, start))
                elif 'getfanslist' in effective_url:  # type = 2: fans list
                    fans = json.loads(metadata)
                    uid = re_uid.findall(effective_url)[0]
                    if "total_count" in fans.keys() and fans["total_count"] > 0 and str(start) == "0":
                        for i in range((fans["total_count"] - 1) / ONEPAGE):
                            try:
                                dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 2, 0)' % (uid, str(ONEPAGE * (i + 1)), str(ONEPAGE)))
                            except Exception as ex:
                                print "E2", str(ex)
                                pass
                    if "fans_list" in fans.keys():
                        for item in fans["fans_list"]:
                            try:
                                dbcurr.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) VALUES(%s, "%s", 0, 0, 0, %s)' % (item['fans_uk'], item['fans_uname'], str(tnow)))
                            except Exception as ex:
                                print "E23", str(ex)
                                pass
                    else:
                        print "delete 2", uid, start
                        dbcurr.execute('delete from urlids where uk=%s and type=2 and start>%s' % (uid, start))
                else:  # type = 0: share list
                    shares = json.loads(metadata)
                    uid = re_uid.findall(effective_url)[0]
                    if "total_count" in shares.keys() and shares["total_count"] > 0 and str(start) == "0":
                        for i in range((shares["total_count"] - 1) / ONESHAREPAGE):
                            try:
                                dbcurr.execute('INSERT INTO urlids(uk, start, limited, type, status) VALUES(%s, %s, %s, 0, 0)' % (uid, str(ONESHAREPAGE * (i + 1)), str(ONESHAREPAGE)))
                            except Exception as ex:
                                print "E3", str(ex)
                                pass
                    if "records" in shares.keys():
                        for item in shares["records"]:
                            try:
                                dbcurr.execute('INSERT INTO share(userid, filename, shareid, status) VALUES(%s, "%s", %s, 0)' % (uid, item['title'], item['shareid']))  # item['title'] is the file name
                            except Exception as ex:
                                #print "E33", str(ex), item
                                pass
                    else:
                        print "delete 0", uid, start
                        dbcurr.execute('delete from urlids where uk=%s and type=0 and start>%s' % (uid, str(start)))
            dbcurr.execute('delete from urlids where id=%s' % (id, ))  # this url job is done
            dbconn.commit()
        except Exception as ex:
            print "E5", str(ex), id
    dbcurr.close()
    dbconn.close()  # close the database connection
def worker():  # scheduler: turns pending database jobs into requests
    global success, failed
    dbconn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'baiduyun', charset='utf8')
    dbcurr = dbconn.cursor()
    dbcurr.execute('SET NAMES utf8')
    dbcurr.execute('set global wait_timeout=60000')  # database setup
    while True:
        #dbcurr.execute('select * from urlids where status=0 order by type limit 1')
        dbcurr.execute('select * from urlids where status=0 and type>0 limit 1')  # type>0: follow/fans jobs only
        # Note: with type>0 only follow and fans jobs are dispatched here; use the
        # commented-out query above if share-list (type=0) jobs should be fetched too.
        d = dbcurr.fetchall()  # fetch one pending job at a time
        #print d
        if d:
            id = d[0][0]     # url job id
            uk = d[0][1]     # user id
            start = d[0][2]  # paging offset
            limit = d[0][3]  # page size
            type = d[0][4]   # job type
            dbcurr.execute('update urlids set status=1 where id=%s' % (str(id),))  # status=1: already dispatched
            url = ""
            if type == 0:    # share list
                url = URL_SHARE.format(uk=uk, start=start, id=id).encode('utf-8')
                # query_uk = uk (user id), start = paging offset, urlid = id (job id)
            elif type == 1:  # follow list
                url = URL_FOLLOW.format(uk=uk, start=start, id=id).encode('utf-8')
            elif type == 2:  # fans list
                url = URL_FANS.format(uk=uk, start=start, id=id).encode('utf-8')
            if url:
                hc_q.put((type, url))  # enqueue the request; type records where the data comes from
                # these urls return the JSON for share, follow or fans data respectively
                #print "processed", url
        else:
            # No pending jobs: expand the crawl tree one level by scheduling jobs
            # for users discovered through follow/fans lists but not yet visited.
            dbcurr.execute('select * from user where status=0 limit 1000')
            d = dbcurr.fetchall()
            if d:
                for item in d:
                    try:
                        # item[1] is the user's uk; start=0 means begin at the first record
                        dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 0, 0)' % (item[1], str(ONESHAREPAGE)))  # share-list job
                        dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 1, 0)' % (item[1], str(ONEPAGE)))       # follow-list job
                        dbcurr.execute('insert into urlids(uk, start, limited, type, status) values("%s", 0, %s, 2, 0)' % (item[1], str(ONEPAGE)))       # fans-list job
                        dbcurr.execute('update user set status=1 where userid=%s' % (item[1],))  # mark this user as scheduled
                        # share, follow and fans jobs have now all been created for this user
                    except Exception as ex:
                        print "E6", str(ex)
            else:
                time.sleep(1)
        dbconn.commit()
    dbcurr.close()
    dbconn.close()
def main():
    print 'starting at:', datetime.datetime.now()
    for item in range(16):
        t = threading.Thread(target=req_worker, args=(item,))
        t.setDaemon(True)
        t.start()  # start 16 request threads
    s = threading.Thread(target=worker, args=())
    s.setDaemon(True)
    s.start()      # start the scheduler thread
    response_worker()  # the main thread runs response_worker
    print 'all Done at:', datetime.datetime.now()

if __name__ == '__main__':
    main()
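The script assumes a local MySQL database named baiduyun with three tables: urlids (pending crawl jobs), user (discovered users) and share (harvested share records). The original post does not include the table definitions, so the following is a minimal bootstrap sketch inferred from the INSERT and SELECT statements above; the column types, the UNIQUE constraint and the seed row are my own assumptions (the seed uk is simply the example user id from the sample JSON). Note that worker() reads columns by position, so the column order of urlids and user matters.
# -*- coding: utf-8 -*-
# Hypothetical bootstrap script: creates the tables the crawler expects and seeds
# one user so the crawl has somewhere to start. Inferred from the queries above,
# not taken from the original post.
import MySQLdb as mdb

conn = mdb.connect('127.0.0.1', 'root', '', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS baiduyun DEFAULT CHARACTER SET utf8')
cur.execute('USE baiduyun')
cur.execute("""CREATE TABLE IF NOT EXISTS urlids (
    id INT AUTO_INCREMENT PRIMARY KEY,   -- url job id (worker() reads d[0][0])
    uk BIGINT,                           -- user id to query (d[0][1])
    start INT,                           -- paging offset (d[0][2])
    limited INT,                         -- page size (d[0][3])
    type TINYINT,                        -- 0 share, 1 follow, 2 fans (d[0][4])
    status TINYINT                       -- 0 pending, 1 dispatched
)""")
cur.execute("""CREATE TABLE IF NOT EXISTS user (
    id INT AUTO_INCREMENT PRIMARY KEY,
    userid BIGINT UNIQUE,                -- uk of a discovered user (item[1])
    username VARCHAR(255),
    files INT,
    status TINYINT,                      -- 0 not yet scheduled, 1 scheduled
    downloaded INT,
    lastaccess INT                       -- unix timestamp of insertion
)""")
cur.execute("""CREATE TABLE IF NOT EXISTS share (
    id INT AUTO_INCREMENT PRIMARY KEY,
    userid BIGINT,
    filename VARCHAR(500),               -- share title
    shareid BIGINT,
    status TINYINT
)""")
# Seed one user so worker() has something to expand (example uk from the sample JSON).
cur.execute('INSERT INTO user(userid, username, files, status, downloaded, lastaccess) '
            'VALUES(1798788396, "SONYcity03", 0, 0, 0, 0)')
conn.commit()
conn.close()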
I've set up a QQ group for technical discussion; everyone is welcome to join: 512245829. If you prefer Weibo, just follow 轉盤娛樂.
