I've been job hunting recently, and this was a take-home question from a web-crawler interview. The anti-scraping measures it touches are fairly comprehensive, but the company's bar was high: they also wanted the problem of expiring video links solved, so that was the end of that.
Straight to the code:
import requests
import time
from datetime import datetime
import json
import execjs
import hashlib
import re
import csv
from zlib import crc32
from base64 import b64decode
import random
import urllib3
import os
import threading
from queue import Queue
from lxml import etree

# Check which JS runtime execjs found
# print(execjs.get().name)

# Suppress SSL verification warnings
urllib3.disable_warnings()

"""
Requires a Node.js environment. You also need to add encoding='utf-8' to the __init__ of class Popen(object)
in subprocess.py, otherwise calling the JS file raises an encoding error.
When requesting the list pages, the UA header in this .py file must match the one in the JS file,
otherwise it is very hard to get data back; use the UA pool for the detail pages or the browser/IP gets banned.
Some output tables will be empty because the account has not published anything in the last seven days,
or because the account has been banned.
Results are written to /toutiao/ at the root of the drive this file is on.
Run this .py file directly; newsign.js and toutiao.csv must be in the same folder.
The crawled video links are only valid for a limited time.
"""


# User-Agent pool
def headers():
    # Assorted desktop UAs
    user_agent_list = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360 browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers


# Fixed UA for the list-page requests (must match the UA used in newsign.js)
headers_a = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
}

# Proxy IP (defined here but not passed to any request below)
proxy = {
    'http': '183.57.44.62:808'
}

# Cookie values
cookies = {'s_v_web_id': 'b68312370162a4754efb0510a0f6d394'}


# Get _signature
def get_signature(user_id, max_behot_time):
    with open('newsign.js', 'r', encoding='utf-8') as f:
        jsData = f.read()
    execjs.get()
    # Reproduces TAC.sign(userInfo.id + "" + i.param.max_behot_time)
    ctx = execjs.compile(jsData).call('tac', str(user_id) + str(max_behot_time))
    return ctx


# Get as and cp
def get_as_cp():  # builds the as/cp parameters; ported from Toutiao's obfuscated JS file home_4abea46.js
    zz = {}
    now = round(time.time())
    # print(now)  # current machine time
    e = hex(int(now)).upper()[2:]  # hex() converts an integer to its hexadecimal string representation
    # print('e:', e)
    a = hashlib.md5()  # hashlib.md5().hexdigest() creates a hash object and returns the hex digest
    # print('a:', a)
    a.update(str(int(now)).encode('utf-8'))
    i = a.hexdigest().upper()
    # print('i:', i)
    if len(e) != 8:
        zz = {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}
        return zz
    n = i[:5]
    a = i[-5:]
    r = ''
    s = ''
    for i in range(5):
        s = s + n[i] + e[i]
    for j in range(5):
        r = r + e[j + 3] + a[j]
    zz = {
        'as': 'A1' + s + e[-3:],
        'cp': e[0:3] + r + 'E1'
    }
    # print('zz:', zz)
    return zz


# Get as, cp, _signature (deprecated)
def get_js():
    f = open(r"juejin.js", 'r', encoding='UTF-8')  # open the JS file
    line = f.readline()
    htmlstr = ''
    while line:
        htmlstr = htmlstr + line
        line = f.readline()
    ctx = execjs.compile(htmlstr)
    return ctx.call('get_as_cp_signature')

# print(json.loads(get_js())['as'])


# Article data
break_flag = []


def wenzhang(url=None, max_behot_time=0, n=0, csv_name=0):
    max_qingqiu = 50
    headers1 = ['發表時間', '標題', '來源', '所有圖片', '文章內容']
    first_url = 'https://www.toutiao.com/c/user/article/?page_type=1&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
        url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
        get_signature(url.split('/')[-2], max_behot_time))
    while n < max_qingqiu and not break_flag:
        try:
            # print(url)
            r = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(r.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            if max_behot_time:
                article_list = data['data']
                for i in article_list:
                    try:
                        if i['article_genre'] == 'article':
                            res = requests.get('https://www.toutiao.com/i' + i['group_id'],
                                               headers=headers(), cookies=cookies)
                            # time.sleep(1)
                            article_title = re.findall("title: '(.*?)'", res.text)
                            article_content = re.findall("content: '(.*?)'", res.text, re.S)[0]
                            # pattern = re.compile(r"[(a-zA-Z~\-_!@#$%\^\+\*&\\\/\?\|:\.<>{}()';=)*|\d]")
                            # article_content = re.sub(pattern, '', article_content[0])
                            article_content = article_content.replace('"', '').replace('u003C', '<').replace(
                                'u003E', '>').replace('=', '=').replace('u002F', '/').replace('\\', '')
                            article_images = etree.HTML(article_content)
                            article_image = article_images.xpath('//img/@src')
                            article_time = re.findall("time: '(.*?)'", res.text)
                            article_source = re.findall("source: '(.*?)'", res.text, re.S)
                            result_time = []
                            [result_time.append(i) for i in
                             str(article_time[0]).split(' ')[0].replace('-', ',').split(',')]
                            # print(result_time)
                            cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]),
                                                             int(result_time[2]))).days
                            # print(cha)
                            if 30 < cha <= 32:
                                # print('完成')
                                # break_flag.append(1)
                                # break
                                continue
                            if cha > 32:
                                print('完成')
                                break_flag.append(1)
                                break
                            row = {'發表時間': article_time[0], '標題': article_title[0].strip('"'),
                                   '來源': article_source[0], '所有圖片': article_image,
                                   '文章內容': article_content.strip()}
                            with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='',
                                      encoding='gb18030') as f:
                                f_csv = csv.DictWriter(f, headers1)
                                # f_csv.writeheader()
                                f_csv.writerow(row)
                            print('正在爬取文章:', article_title[0].strip('"'), article_time[0],
                                  'https://www.toutiao.com/i' + i['group_id'])
                            time.sleep(1)
                        else:
                            pass
                    except Exception as e:
                        print(e, 'https://www.toutiao.com/i' + i['group_id'])
                # fetch the next page with the updated max_behot_time
                wenzhang(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
            else:
                pass
        except KeyError:
            n += 1
            print('第' + str(n) + '次請求', first_url)
            time.sleep(1)
            if n == max_qingqiu:
                print('請求超過最大次數')
                break_flag.append(1)
            else:
                pass
        except Exception as e:
            print(e)
        else:
            pass
    # print(max_behot_time)
    # print(data)


# Article detail page data (already merged into the article function above)
def get_wenzhang_detail(url, csv_name=0):
    headers1 = ['發表時間', '標題', '來源', '文章內容']
    res = requests.get(url, headers=headers_a, cookies=cookies)
    # time.sleep(1)
    article_title = re.findall("title: '(.*?)'", res.text)
    article_content = re.findall("content: '(.*?)'", res.text, re.S)
    pattern = re.compile(r"[(a-zA-Z~\-_!@#$%\^\+\*&\\\/\?\|:\.<>{}()';=)*|\d]")
    article_content = re.sub(pattern, '', article_content[0])
    article_time = re.findall("time: '(.*?)'", res.text)
    article_source = re.findall("source: '(.*?)'", res.text, re.S)
    result_time = []
    [result_time.append(i) for i in str(article_time[0]).split(' ')[0].replace('-', ',').split(',')]
    # print(result_time)
    cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]), int(result_time[2]))).days
    # print(cha)
    if cha > 8:
        return None
    row = {'發表時間': article_time[0], '標題': article_title[0].strip('"'),
           '來源': article_source[0], '文章內容': article_content.strip()}
    with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='') as f:
        f_csv = csv.DictWriter(f, headers1)
        # f_csv.writeheader()
        f_csv.writerow(row)
    print('正在爬取文章:', article_title[0].strip('"'), article_time[0], url)
    time.sleep(0.5)
    return 'ok'


# Video data
break_flag_video = []


def shipin(url, max_behot_time=0, csv_name=0, n=0):
    max_qingqiu = 20
    headers2 = ['視頻發表時間', '標題', '來源', '視頻鏈接']
    first_url = 'https://www.toutiao.com/c/user/article/?page_type=0&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
        url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
        get_signature(url.split('/')[-2], max_behot_time))
    while n < max_qingqiu and not break_flag_video:
        try:
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            if max_behot_time:
                video_list = data['data']
                for i in video_list:
                    try:
                        start_time = i['behot_time']
                        video_title = i['title']
                        video_source = i['source']
                        detail_url = 'https://www.ixigua.com/i' + i['item_id']
                        resp = requests.get(detail_url, headers=headers())
                        r = str(random.random())[2:]
                        url_part = "/video/urls/v/1/toutiao/mp4/{}?r={}".format(
                            re.findall('"video_id":"(.*?)"', resp.text)[0], r)
                        # sign the request path with crc32, then hit the video-url API
                        s = crc32(url_part.encode())
                        api_url = "https://ib.365yg.com{}&s={}".format(url_part, s)
                        resp = requests.get(api_url, headers=headers())
                        j_resp = resp.json()
                        video_url = j_resp['data']['video_list']['video_1']['main_url']
                        video_url = b64decode(video_url.encode()).decode()
                        # print((int(str(time.time()).split('.')[0]) - start_time) / 86400)
                        if 30 < (int(str(time.time()).split('.')[0]) - start_time) / 86400 <= 32:
                            # print('完成')
                            # break_flag_video.append(1)
                            continue
                        if (int(str(time.time()).split('.')[0]) - start_time) / 86400 > 32:
                            print('完成')
                            break_flag_video.append(1)
                            break
                        row = {'視頻發表時間': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)),
                               '標題': video_title, '來源': video_source, '視頻鏈接': video_url}
                        with open('/toutiao/' + str(csv_name) + '視頻.csv', 'a', newline='',
                                  encoding='gb18030') as f:
                            f_csv = csv.DictWriter(f, headers2)
                            # f_csv.writeheader()
                            f_csv.writerow(row)
                        print('正在爬取視頻:', video_title, detail_url, video_url)
                        time.sleep(3)
                    except Exception as e:
                        print(e, 'https://www.ixigua.com/i' + i['item_id'])
                # fetch the next page with the updated max_behot_time
                shipin(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
        except KeyError:
            n += 1
            print('第' + str(n) + '次請求', first_url)
            time.sleep(3)
            if n == max_qingqiu:
                print('請求超過最大次數')
                break_flag_video.append(1)
        except Exception as e:
            print(e)
        else:
            pass


# Weitoutiao (microblog posts)
break_flag_weitoutiao = []


def weitoutiao(url, max_behot_time=0, n=0, csv_name=0):
    max_qingqiu = 20
    headers3 = ['微頭條發表時間', '來源', '標題', '文章內圖片', '微頭條內容']
    while n < max_qingqiu and not break_flag_weitoutiao:
        try:
            first_url = 'https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=%s&max_behot_time=%s' % (
                url.split('/')[-2], max_behot_time)
            # print(first_url)
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            weitoutiao_list = data['data']
            for i in weitoutiao_list:
                try:
                    detail_url = 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id'])
                    # print(detail_url)
                    resp = requests.get(detail_url, headers=headers(), cookies=cookies)
                    start_time = re.findall("time: '(.*?)'", resp.text, re.S)
                    weitoutiao_name = re.findall("name: '(.*?)'", resp.text, re.S)
                    weitoutiao_title = re.findall("title: '(.*?)'", resp.text, re.S)
                    weitoutiao_images = re.findall(r'images: \["(.*?)"\]', resp.text, re.S)
                    # print(weitoutiao_images)
                    if weitoutiao_images:
                        weitoutiao_image = 'http:' + weitoutiao_images[0].replace('u002F', '/').replace('\\', '')
                        # print(weitoutiao_image)
                    else:
                        weitoutiao_image = '此頭條內無附件圖片'
                    weitoutiao_content = re.findall("content: '(.*?)'", resp.text, re.S)
                    result_time = []
                    [result_time.append(i) for i in
                     str(start_time[0]).split(' ')[0].replace('-', ',').split(',')]
                    # print(result_time)
                    cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]),
                                                     int(result_time[2]))).days
                    # print(cha)
                    if cha > 30:
                        break_flag_weitoutiao.append(1)
                        print('完成')
                        break
                    row = {'微頭條發表時間': start_time[0], '來源': weitoutiao_name[0],
                           '標題': weitoutiao_title[0].strip('"'), '文章內圖片': weitoutiao_image,
                           '微頭條內容': weitoutiao_content[0].strip('"')}
                    with open('/toutiao/' + str(csv_name) + '微頭條.csv', 'a', newline='',
                              encoding='gb18030') as f:
                        f_csv = csv.DictWriter(f, headers3)
                        # f_csv.writeheader()
                        f_csv.writerow(row)
                    time.sleep(1)
                    print('正在爬取微頭條', weitoutiao_name[0], start_time[0], detail_url)
                except Exception as e:
                    print(e, 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id']))
            # fetch the next page with the updated max_behot_time
            weitoutiao(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
        except KeyError:
            n += 1
            print('第' + str(n) + '次請求')
            time.sleep(2)
            if n == max_qingqiu:
                print('請求超過最大次數')
                break_flag_weitoutiao.append(1)
            else:
                pass
        except Exception as e:
            print(e)
        else:
            pass


# Read the list of accounts to crawl
def csv_read(path):
    data = []
    with open(path, 'r', encoding='gb18030') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            data.append(row)
    return data


# Single-threaded entry point
def main():
    for j, i in enumerate(csv_read('toutiao-suoyou.csv')):
        # data_url = data.get_nowait()
        if '文章' in i[3]:
            # start the article crawler
            print('當前正在抓取文章第', j, i[2])
            headers1 = ['發表時間', '標題', '來源', '所有圖片', '文章內容']
            with open('/toutiao/' + i[0] + '文章.csv', 'a', newline='') as f:
                f_csv = csv.DictWriter(f, headers1)
                f_csv.writeheader()
            break_flag.clear()
            wenzhang(url=i[2], csv_name=i[0])
        if '視頻' in i[3]:
            # start the video crawler
            print('當前正在抓取視頻第', j, i[2])
            headers2 = ['視頻發表時間', '標題', '來源', '視頻鏈接']
            with open('/toutiao/' + i[0] + '視頻.csv', 'a', newline='') as f:
                f_csv = csv.DictWriter(f, headers2)
                f_csv.writeheader()
            break_flag_video.clear()
            shipin(url=i[2], csv_name=i[0])
        if '微頭條' in i[3]:
            # start the weitoutiao crawler
            headers3 = ['微頭條發表時間', '來源', '標題', '文章內圖片', '微頭條內容']
            print('當前正在抓取微頭條第', j, i[2])
            with open('/toutiao/' + i[0] + '微頭條.csv', 'a', newline='') as f:
                f_csv = csv.DictWriter(f, headers3)
                f_csv.writeheader()
            break_flag_weitoutiao.clear()
            weitoutiao(url=i[2], csv_name=i[0])


# Multithreaded runner
def get_all(urlQueue):
    while True:
        try:
            # read from the queue without blocking
            data_url = urlQueue.get_nowait()
            # i = urlQueue.qsize()
        except Exception as e:
            break
        # print(data_url)
        # if '文章' in data_url[3]:
        #     # start the article crawler
        #     print('當前正在抓取文章', data_url[2])
        #     headers1 = ['發表時間', '標題', '來源', '所有圖片', '文章內容']
        #     with open('/toutiao/' + data_url[0] + '文章.csv', 'a', newline='') as f:
        #         f_csv = csv.DictWriter(f, headers1)
        #         f_csv.writeheader()
        #     break_flag.clear()
        #     wenzhang(url=data_url[2], csv_name=data_url[0])
        if '視頻' in data_url[3]:
            # start the video crawler
            print('當前正在抓取視頻', data_url[2])
            headers2 = ['視頻發表時間', '標題', '來源', '視頻鏈接']
            with open('/toutiao/' + data_url[0] + '視頻.csv', 'a', newline='') as f:
                f_csv = csv.DictWriter(f, headers2)
                f_csv.writeheader()
            break_flag_video.clear()
            shipin(url=data_url[2], csv_name=data_url[0])
        # if '微頭條' in data_url[3]:
        #     # start the weitoutiao crawler
        #     headers3 = ['微頭條發表時間', '來源', '標題', '文章內圖片', '微頭條內容']
        #     print('當前正在抓取微頭條', data_url[2])
        #     with open('/toutiao/' + data_url[0] + '微頭條.csv', 'a', newline='') as f:
        #         f_csv = csv.DictWriter(f, headers3)
        #         f_csv.writeheader()
        #     break_flag_weitoutiao.clear()
        #     weitoutiao(url=data_url[2], csv_name=data_url[0])


if __name__ == '__main__':
    # create the output directory
    path = '/toutiao/'
    if not os.path.exists(path):
        os.mkdir(path)
    """Use main() for a single-threaded run. For multithreading, control the thread count as below;
    multithreaded runs make requests too frequently, Toutiao's anti-scraping will ban the IP, so proxy IPs are needed."""
    # main()
    urlQueue = Queue()
    for j, i in enumerate(csv_read('toutiao-suoyou.csv')):
        urlQueue.put(i)
    # print(urlQueue.get_nowait())
    # print(urlQueue.qsize())
    threads = []
    # adjust the thread count to control crawl speed
    threadNum = 4
    for i in range(0, threadNum):
        t = threading.Thread(target=get_all, args=(urlQueue,))
        threads.append(t)
    for t in threads:
        # daemon threads would exit together with the thread that started them
        # t.setDaemon(True)
        t.start()
    for t in threads:
        # join each thread in turn so the main thread exits last and the threads do not block each other
        t.join()
    # pass
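One small gap worth noting: the script defines a `proxy` dict but never passes it to any request, even though the note in `__main__` says multithreaded runs need proxy IPs. Below is a minimal sketch of how it could be wired in through requests' `proxies` argument; the helper name `get_with_proxy` is mine, the proxy address is just the placeholder from the script, and `first_url`/`headers_a` stand in for whichever request in `wenzhang`/`shipin` you want to route through the proxy.

```python
import requests

# Placeholder proxy from the script; any working HTTP proxy would go here.
proxy = {'http': 'http://183.57.44.62:808'}

def get_with_proxy(url, headers, cookies=None, timeout=10):
    # Same call the crawler already makes, plus proxies= and a timeout,
    # so a dead proxy fails fast instead of hanging a worker thread.
    return requests.get(url, headers=headers, cookies=cookies,
                        proxies=proxy, timeout=timeout)

# Usage (hypothetical, inside wenzhang/shipin):
# r = get_with_proxy(first_url, headers_a, cookies=cookies)
```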
Reading the user information from the CSV file:
The crawled results:
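The screenshot of the input file is not reproduced here, but judging from how `main()` and `get_all()` index each row (column 0 is the output-file prefix, column 2 the profile URL whose second-to-last path segment is the user_id, column 3 the content types to crawl), a `toutiao-suoyou.csv` row presumably looks like the sketch below. The account values and the second column are made-up placeholders.

```python
import csv

# Hypothetical rows in the layout the code appears to expect:
# [output prefix, nickname (unused by the code), profile URL ending in '/', content types]
sample_rows = [
    ['001', '示例賬號', 'https://www.toutiao.com/c/user/1234567890123456/', '文章,視頻,微頭條'],
    ['002', '示例賬號2', 'https://www.toutiao.com/c/user/6543210987654321/', '視頻'],
]

# csv_read() opens the file with gb18030, so write it with the same encoding.
with open('toutiao-suoyou.csv', 'w', newline='', encoding='gb18030') as f:
    csv.writer(f, dialect='excel').writerows(sample_rows)
```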
This content is for reference and learning purposes only; if there are any concerns, contact the author and I will take it down.
Still looking for a web-scraping job.