I. Case studies
1. Scraping data with requests (get, post) and urllib
https://www.cnblogs.com/lovershowtime/p/11771338.html
Simple scrape of the Sogou home page, saved locally
import requests

ret = requests.get(url="https://www.sogou.com")
aa = ret.text
print(ret.text)
with open("aa.html", "w", encoding="utf-8") as f:
    f.write(aa)
GET: scrape Sogou for a user-supplied keyword and write the result locally
import requests

wd = input("Enter a search keyword: ")
param = {
    'query': wd
}
ret = requests.get(url="https://www.sogou.com", params=param)   # params: the dynamic data packed into a dict
print(ret)
# aa = ret.text       # returns a string
aa = ret.content      # returns binary data
with open("bb.html", "wb") as f:
    f.write(aa)
print("Done")
POST: scrape Baidu Translate suggestion data
import requests

wd = input("Enter a word: ")
data = {
    'query': wd    # note: the /sug endpoint actually expects the key 'kw'; with 'query' it returns errno 1001, as shown below
}
ret = requests.post(url="https://fanyi.baidu.com/sug", data=data)
print(ret.content)   # b'{"errno":1001,"errmsg":"\\u53c2\\u6570\\u9519\\u8bef"}'  -- bytes
print(ret.text)      # {"errno":1001,"errmsg":"\u53c2\u6570\u9519\u8bef"}  -- string
print(ret.json())    # {'errno': 1001, 'errmsg': 'parameter error'}  -- dict; the response body must be JSON, otherwise this raises an error
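Since ret.json() raises an exception whenever the response body is not valid JSON, it can help to wrap the call in a small guard. A minimal sketch (the form key 'kw' used here is an assumption about the /sug endpoint, not something shown above):

import requests

ret = requests.post(url="https://fanyi.baidu.com/sug", data={'kw': 'dog'})   # 'kw' is an assumed parameter name
try:
    print(ret.json())        # parsed dict when the body is valid JSON
except ValueError:           # requests raises a ValueError subclass when the body is not JSON
    print("Response was not JSON:", ret.text[:200])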
GET: scrape Douban movie rankings
# https://movie.douban.com/j/chart/top_list?type=20&interval_id=100%3A90&action=&start=140
import requests

date = {
    'type': '5',
    'interval_id': '100:90',
    'action': '',
    'start': '1',
    'limit': '23'
}
res = requests.get(url="https://movie.douban.com/j/chart/top_list?", params=date)
print(res.json())
POST: scrape KFC store locations
import requests

keyword = input("Enter a city: ")
date = {
    'cname': '',
    'pid': '',
    'keyword': keyword,
    'pageIndex': '1',
    'pageSize': '10'
}
res = requests.post(url="http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword", data=date)
print(res.url)
print(res.json())
POST: scrape cosmetics production licence details (dynamic data)
# Dynamic (AJAX) data; spoof the browser User-Agent to get past the anti-scraping check.  http://125.35.6.84:81/xk/
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
aa = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
id_list = []
for page in range(1, 11):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '5',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': ''
    }
    res = requests.post(url=aa, data=data, headers=headers).json()
    # print(res)
    # print(res["list"])
    for dic in res["list"]:
        id = dic["ID"]
        id_list.append(id)
print(id_list)

id_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for li_id in id_list:
    id_data = {
        'id': li_id
    }
    ret = requests.post(url=id_url, data=id_data, headers=headers).json()
    print(ret)
GET: download an image and write it to a local file
import requests
import urllib.request

aa = 'http://d.hiphotos.baidu.com/album/pic/item/b58f8c5494eef01f8931cc7ae1fe9925bc317d6c.jpg?psign=8931cc7ae1fe9925bc315c6034a85edf8cb1cb1349545954'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
res = requests.get(url=aa, headers=headers).content
with open("./aa.jpg", "wb") as f:
    f.write(res)

urllib.request.urlretrieve(url=aa, filename="./11.jpg")   # does the same as the requests version above
Scrape Chouti: log in and upvote (GET + POST)
# Step 1: visit the home page. Step 2: submit the username and password.
# Normally you would just send the login request and read the result. Chouti's trick: the very first request to any page
# already returns a set of cookies, and the login request must carry those cookies to be authorized.
import requests

# 1. Visit the home page
r1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
)
# 2. Submit the username and password
r2 = requests.post(
    url='https://dig.chouti.com/login',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    },
    data={                      # payload sent with the login request
        'phone': '8617380117935',
        'password': 'lv5555555',
        'oneMonth': 1
    },
    cookies=r1.cookies.get_dict()
)
print(r2.text)
print(r2.cookies.get_dict())    # these cookies are a decoy; the ones from r1 are what matter
# 3. Upvote
r3 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=20435396',   # upvote URL
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    },
    cookies=r1.cookies.get_dict()   # carry r1's cookies
)
print(r3.text)
print(r1.cookies.get_dict())
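The same three steps can also be written with requests.Session(), which stores the cookies from the first request and re-sends them automatically, so they do not have to be passed around by hand. A minimal sketch reusing the URLs and payload above:

import requests

s = requests.Session()
ua = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

s.get('https://dig.chouti.com/', headers=ua)                      # 1. home page: the session keeps the cookies
s.post('https://dig.chouti.com/login', headers=ua,                # 2. login: the stored cookies are sent automatically
       data={'phone': '8617380117935', 'password': 'lv5555555', 'oneMonth': 1})
r = s.post('https://dig.chouti.com/link/vote?linksId=20435396', headers=ua)   # 3. upvote
print(r.text)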
2. Scraping data with requests and regular expressions
https://www.cnblogs.com/lovershowtime/p/11776549.html
Regex: scrape and download images from Qiushibaike
# https://www.qiushibaike.com/
import os
import re
import urllib.request

import requests

url = "https://www.qiushibaike.com/pic/page/%d/?s=5170552"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
if not os.path.exists("./img"):
    os.mkdir("./img")
start = int(input("Start page: "))
end = int(input("End page: "))
for page in range(start, end + 1):
    new_url = format(url % page)   # e.g. https://www.qiushibaike.com/pic/page/1/?s=5170552
    print(new_url)
    page_text = requests.get(url=new_url, headers=headers).text
    img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?></div>', page_text, re.S)
    for img_url in img_url_list:
        img_urls = 'https:' + img_url          # the src in the page is protocol-relative (//...)
        imgname = img_urls.split("/")[-1]
        imgpath = "img/" + imgname
        urllib.request.urlretrieve(url=img_urls, filename=imgpath)
        print("Downloaded", imgname)

# re.findall(pattern, string)
# .  matches any character
# *  zero or more of the preceding character
# ?  makes the match non-greedy
# re.S  lets . also match newlines

aa = "http://img95.699pic.com/photo/50045/7601.jpg_wh300.jpg"
print(aa.split("/"))
print(aa.split("/")[-1])   # 7601.jpg_wh300.jpg
3. Scraping data with requests and BeautifulSoup
https://www.cnblogs.com/lovershowtime/p/11771726.html
Scrape the classical-literature site (shicimingju.com) and write the chapters to a local file
import requests
from bs4 import BeautifulSoup

url = "http://www.shicimingju.com/book/sanguoyanyi.html"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
pang_text = requests.get(url=url, headers=headers).text
sup = BeautifulSoup(pang_text, "lxml")
list_li = sup.select('.book-mulu>ul>li>a')   # chapter links in the table of contents
fp = open("aa.txt", "w", encoding="utf-8")
for a in list_li:
    title = a.string
    print(title)
    urls_text = 'http://www.shicimingju.com' + a["href"]
    print(urls_text)   # e.g. http://www.shicimingju.com/book/nanbeishiyanyi/10.html
    pa_test = requests.get(url=urls_text, headers=headers).text
    sup = BeautifulSoup(pa_test, "lxml")
    cont = sup.find('div', class_='chapter_content').text
    fp.write(title + '\n' + cont)
    print(title)
fp.close()
Scrape Autohome news and download the images locally
import requests
from bs4 import BeautifulSoup   # parses an HTML string into an object; use .find / .find_all on it

response = requests.get("https://www.autohome.com.cn/news/")
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')   # parse
div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})   # this div contains all the news items
print(div)
li_list = div.find_all(name='li')
for li in li_list:
    title = li.find(name='h3')   # headline
    if not title:
        continue
    p = li.find(name='p')
    a = li.find(name='a')
    print(title.text)
    print(a.attrs.get('href'))   # read an attribute
    print(p.text)
    # grab the image
    img = li.find(name='img')
    src = img.get('src')
    src = "https:" + src
    print(src)
    # send another request to download the image
    file_name = src.rsplit('/', maxsplit=1)[1]
    ret = requests.get(src)
    with open(file_name, 'wb') as f:
        f.write(ret.content)
Scrape Chouti post titles
import requests
from bs4 import BeautifulSoup

r1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
)
soup = BeautifulSoup(r1.text, 'html.parser')   # tag object
content_list = soup.find(name='div', attrs={"class": "link-con"})
# print(content_list)
item_list = content_list.find_all(name='div', attrs={'class': 'link-detail'})   # [tag, tag, ...]
for item in item_list:
    a = item.find(name='a', attrs={'class': 'link-title link-statistics'})
    print(a.text.strip())
    print(a["href"])
4. Scraping data with requests and XPath
https://www.cnblogs.com/lovershowtime/p/11777009.html
Use XPath to scrape second-hand housing listings
import requests from lxml import etree url="https://cd.58.com/ershoufang/?utm_source=sem-sales-baidu-pc&spm=82881519251.21430224112&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg" headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } f=open("ab.txt","w",encoding="utf-8") ret=requests.get(url=url,headers=headers).text tree=etree.HTML(ret) list_le=tree.xpath("//ul[@class='house-list-wrap']/li") print(list_le) for el in list_le: title=el.xpath("./div[2]/h2/a/text()")[0] # 當前第二個div下的h2 下的a的文本 price = el.xpath("./div[3]//text()") # 當前第三個div下的 所有的文本 pi=''.join(price) f.write(title+":"+pi+"\n") f.close() # /html/body/div[5]/div[5]/div[1]/ul/li[1]/div[2] # # /html/body/div[5]/div[5]/div[1]/ul/li[1]/div[2]/h2
Use XPath to scrape images and download them locally
# http://pic.netbian.com/4kmeinv/
import os
import urllib.request

import requests
from lxml import etree

url = "http://pic.netbian.com/4kmeinv/"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
if not os.path.exists("./imgs"):
    os.mkdir("./imgs")
ret = requests.get(url=url, headers=headers)
# ret.encoding = "utf-8"   # the usual fix for garbled text (does not help here)
ret_li = etree.HTML(ret.text)
li_list = ret_li.xpath("//div[@class='slist']/ul/li")
for li in li_list:
    li_name = li.xpath("./a/b/text()")[0]
    # Fix the mojibake: re-encode, then decode with the page's real encoding (gbk).
    # This trick works wherever the garbling appears; it can also be applied to the whole response.
    li_img_name = li_name.encode('ISO-8859-1').decode("gbk")
    img_url = "http://pic.netbian.com" + li.xpath("./a/img/@src")[0]
    img_path = './imgs/' + li_img_name + '.jpg'
    urllib.request.urlretrieve(url=img_url, filename=img_path)
    print(img_path, "downloaded")
# Scraping images whose real URLs are obfuscated (an anti-scraping measure):
# the page stores a base64-encoded hash instead of the image URL; base64.b64decode returns bytes.
import base64
import urllib.request

import requests
from lxml import etree

url = "http://jandan.net/ooxx"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
ret = requests.get(url=url, headers=headers).text
print(ret)
ret_li = etree.HTML(ret)
li_list = ret_li.xpath("//span[@class='img_hash']/text()")
for img_hash in li_list:
    img_url = "http:" + base64.b64decode(img_hash).decode()   # decode the obfuscated image path
    img_name = img_url.split("/")[-1]
    urllib.request.urlretrieve(url=img_url, filename=img_name)
Scrape and download résumé templates (sc.chinaz.com)

import random
import urllib.request

import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = "http://sc.chinaz.com/jianli/free_%d.html"   # page-number template
for page in range(1, 4):
    if page == 1:
        new_url = "http://sc.chinaz.com/jianli/free.html"   # free templates, first page
    else:
        new_url = format(url % page)
    ret = requests.get(url=new_url, headers=headers)
    ret.encoding = 'utf-8'   # fix garbled text
    tree = etree.HTML(ret.text)
    div_list = tree.xpath("//div[@id='container']/div")
    for div in div_list:
        det_url = div.xpath("./a/@href")[0]     # link to the template's detail page
        name = div.xpath("./a/img/@alt")[0]     # template name from the image's alt text
        print(name)
        dat_page = requests.get(url=det_url, headers=headers).text
        trees = etree.HTML(dat_page)
        dowloand_list = trees.xpath("//div[@class='clearfix mt20 downlist']/ul/li/a/@href")
        dow_url = random.choice(dowloand_list)   # pick one of the mirror download links
        data = requests.get(url=dow_url, headers=headers).content
        fileName = name + ".rar"
        with open(fileName, "wb") as f:
            f.write(data)
        print(fileName, "downloaded")

Problem: when sending a large number of requests you will often see an error like HTTPConnectionPool(...): Max retries exceeded with url.
Causes:
1. Before each transfer the client opens a TCP connection to the server. To save time the connection is kept alive by default (keep-alive); if connections are never closed, the connection pool eventually fills up, no new connection object can be created, and requests can no longer be sent.
2. The IP has been blocked.
3. Requests are sent too frequently.
Fixes: set the Connection header to close so the connection is dropped after each successful request; switch IPs; sleep between requests (see the sketch below).
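A minimal sketch of the fixes just listed (Connection: close plus a pause between requests); the URL list here is only a placeholder:

import time
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Connection': 'close',        # drop the connection after every request instead of keeping it alive
}
urls = ["http://sc.chinaz.com/jianli/free.html"]   # placeholder list of pages to fetch
for u in urls:
    try:
        resp = requests.get(u, headers=headers, timeout=10)
        print(u, resp.status_code)
    except requests.exceptions.RequestException as err:
        print("request failed:", err)
    time.sleep(2)                 # wait between requests to avoid hammering the server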
Scrape the list of cities from the air-quality history site
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = "https://www.aqistudy.cn/historydata/"
ret = requests.get(url=url, headers=headers).text
tree = etree.HTML(ret)
# two XPath expressions combined with |
li_lit = tree.xpath("//div[@class='bottom']/ul/li | //div[@class='bottom']/ul/div[2]/li")
for li in li_lit:
    cont = li.xpath("./a/text()")[0]
    print(cont)
5. Using proxy IPs (single proxies and proxy pools)
Proxies and proxy pools

# Proxy-IP sources: http://www.goubanjia.com/ (全網代理IP)   https://www.kuaidaili.com/ (快代理)
# Anti-scraping countermeasure: route the request through a proxy IP.
# The proxy's protocol type must match the protocol of the requested URL (http vs https).

Method 1: set the proxy directly in requests

import requests

url = "https://www.baidu.com/s?wd=ip"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
ret = requests.get(url=url, headers=headers, proxies={"https": '222.184.59.8:808'}).text
print(ret)
with open("./ip.html", "w", encoding="utf-8") as f:
    f.write(ret)
Method 2: set the proxy with the urllib module

import urllib.request

ip = '119.23.79.199:3128'
proxy = urllib.request.ProxyHandler({"http": ip})
openers = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(openers)
url = "http://www.baidu.com"
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
print(len(data))
f = open("bb.html", 'w', encoding="utf-8")
f.write(data)
f.close()
Method 1: build a proxy pool (suitable when the proxy IPs are stable)

import random
import urllib.request

# proxy pool
pools = [
    "119.23.79.199:3128",
    "221.224.163.54:808",
    "210.26.64.44:3128",
    "27.191.234.69:9999",
]

def ip(pools):
    ips = random.choice(pools)
    proxy = urllib.request.ProxyHandler({"http": ips})
    openers = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(openers)

for i in range(0, 5):
    try:   # the pooled IPs are unstable, so catch exceptions
        ip(pools)
        url = "http://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("gbk", "ignore")
        print(len(data))
        f = open("ss.html", "w")
        f.write(data)
    except Exception as err:
        print(err)
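For comparison, the same pick-a-random-proxy idea written with requests, which the rest of these notes use. A minimal sketch with the same sample IPs (they are unlikely to still be reachable):

import random
import requests

pools = [
    "119.23.79.199:3128",
    "221.224.163.54:808",
    "210.26.64.44:3128",
]
for i in range(5):
    proxy = {"http": "http://" + random.choice(pools)}   # pick a random proxy for each attempt
    try:
        r = requests.get("http://www.baidu.com", proxies=proxy, timeout=5)
        print(len(r.text))
    except requests.exceptions.RequestException as err:  # unstable proxies fail often, so catch and continue
        print(err)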
Method 2: proxy pool built by calling a provider's API (suitable when the fetched proxy IPs are stable)
import urllib.request

def ip():
    # Daxiang proxy API: http://daxiangdaili.com/api
    ips = urllib.request.urlopen("http://www.daxiangdaili.com/ip/?tid=559126871522587&num=2").read().decode("utf-8", "ignore")
    proxy = urllib.request.ProxyHandler({"http": ips})
    openers = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(openers)

for i in range(0, 5):
    try:   # the pooled IPs are unstable, so catch exceptions
        ip()
        url = "http://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("gbk", "ignore")
        print(len(data))
        f = open("ss.html", "w")
        f.write(data)
    except Exception as err:
        print(err)
Method 3: scrape a proxy-list site and build the pool from it
from bs4 import BeautifulSoup
import requests
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

# 2. Fetch a page's content
def getHTMLText(url, proxies):
    try:
        r = requests.get(url, proxies=proxies)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except:
        return 0
    else:
        return r.text

# 3. Get proxy IPs from the proxy-list site, check that they work, and return the list
def get_ip_list(url):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'html.parser')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    # Check availability and keep only working IPs. (The check is imperfect: an IP that fails now may
    # only be temporarily down, and one that passes may stop working after a single use.)
    checked = []
    for ip in ip_list:
        try:
            proxy_temp = {"http": "http://" + ip, "https": "https://" + ip}
            requests.get(url, proxies=proxy_temp, timeout=5)
            checked.append(ip)
        except Exception:
            continue
    return checked

# 4. Pick a random IP from the pool and return it in requests' proxies format
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies

# 5. Use the proxy
if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    ip_list = get_ip_list(url)
    proxies = get_random_ip(ip_list)
    print(proxies)
6. Scraping data with a simulated login
Renren simulated login: packet capture, captcha recognition (via the YunDaMa platform, http://www.yundama.com/), and session usage
# http://www.yundama.com/  YunDaMa (captcha-recognition platform)
# Superme888888@outlook.com
# supreme9999
# @_XJQ1995110
# 17380117935
# http://www.renren.com/SysHome.do
import json
import time
import urllib.request

import requests
from lxml import etree

session = requests.session()   # session object: it will hold the login cookies for us

######################################################################
# YunDaMa captcha-recognition client (the platform's sample code)
class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text


def getCode(username, pwd, codePath, codeType):
    # Ordinary (non-developer) account credentials
    username = username
    password = pwd
    # Software ID and key: get them from the developer console ("My Software")
    appid = 9406
    appkey = '4b671243618fff6a87ebbe33446d09e3'
    # Captcha image file
    filename = codePath
    # Captcha type, e.g. 1004 = 4 alphanumeric characters; see http://www.yundama.com/price.html for all types
    codetype = codeType
    # Timeout in seconds
    timeout = 80
    result = None
    if (username == 'username'):
        print('Please fill in the parameters before testing')
    else:
        yundama = YDMHttp(username, password, appid, appkey)
        uid = yundama.login(); print('uid: %s' % uid)                 # log in to YunDaMa
        balance = yundama.balance(); print('balance: %s' % balance)   # check the balance
        cid, result = yundama.decode(filename, codetype, timeout)     # recognise the captcha
        print('cid: %s, result: %s' % (cid, result))
    return result


# Simulated login
url = "http://www.renren.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
ret = requests.get(url=url, headers=headers).text
terr = etree.HTML(ret)
code_img_url = terr.xpath("//*[@id='verifyPic_login']/@src")[0]   # captcha image URL
urllib.request.urlretrieve(url=code_img_url, filename="code.jpg")
# Recognise the captcha
code_data = getCode("supreme9999", "@_XJQ1995110", "./code.jpg", 2004)
print(code_data)
# Payload captured from the login request
login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019100151870"
data = {
    "email": "17380117935",
    "icode": code_data,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": 1,
    "captcha_type": "web_login",
    "password": "7f68692e5e69afa1ba418b799ec63a0a",
    "rkey": "7f68692e5e69afa1ba418b799ec63a0a",
    "f": "http%3A%2F%2Fwww.renren.com%2F972764841%2Fprofile",
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# With plain requests.post the cookies produced by the login are not stored anywhere:
# get_cont = requests.post(url=login_url, data=data, headers=headers)
# With the session object, the cookies from a successful login are stored automatically
# (note they are only produced once the login succeeds).
session.post(url=login_url, data=data, headers=headers)
urls = "http://www.renren.com/972764841/profile"   # profile page reached after a successful login
pag_text = session.get(url=urls, headers=headers).text
with open("ren.html", "w", encoding="utf-8") as f:
    f.write(pag_text)
Simulated login to the Gushiwen site (so.gushiwen.org)
# YunDaMa account and site credentials: same as listed in the Renren example above.
import requests
from lxml import etree

# The YDMHttp class and the getCode() helper are identical to the ones in the Renren example above,
# so they are not repeated here.

# Fetch the captcha from the Gushiwen login page
s = requests.Session()
url = "https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
ret = requests.get(url=url, headers=headers).text
terr = etree.HTML(ret)
img_src = 'https://so.gushiwen.org' + terr.xpath("//*[@id='imgCode']/@src")[0]
print(img_src)
img_data = s.get(url=img_src, headers=headers).content   # fetch the captcha through the session so the cookie matches the image
with open("./cc.jpg", "wb") as f:
    f.write(img_data)
img_text = getCode("Superme888888@outlook.com", "@_XJQ1995110", "./cc.jpg", 1004)
print(img_text)

# Simulated login
url = "https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx"
date = {
    "__VIEWSTATE": "DmBrtFoRGX4MZ4I+urA1bNT3UpnZRkyA7O/9XO1azxff3G35mKDbCmAunAB+TZAZF6HpQunWGe82fhPXwgs/DVfRY9h/LBljRx97fxgOE7+AkMu12yNZsyIZs1I=",   # hidden form field; read it from the page source
    "__VIEWSTATEGENERATOR": "C93BE1AE",
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "Superme888888@outlook.com",
    "pwd": "@_XJQ1995110",
    "code": img_text,   # the recognised captcha text
    "denglu": "登錄",
}
tesrs = s.post(url=url, headers=headers, data=date).text
with open("./aa.html", "w", encoding="utf-8") as f:
    f.write(tesrs)
7. Scraping images (lazy loading)
# In the page source, the img tag first stores the real image URL in a placeholder attribute (commonly src2, original, ...)
# rather than directly in src. When the image scrolls into the visible area, the page dynamically swaps the placeholder
# into the src attribute; that is how lazy loading works.
url="http://sc.chinaz.com/tupian/xixirenti.html" import requests from lxml import etree headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ret=requests.get(url=url,headers=headers) ret.encoding="utf-8" tests=ret.text tree=etree.HTML(tests) div_list=tree.xpath("//div[@id='container']/div") for div in div_list: img_url=div.xpath(".//img/@src") print(img_url) img_name= div.xpath(".//img/@alt") print(img_name) # 爬取到的圖片為空
url="http://sc.chinaz.com/tupian/xixirenti.html" import requests from lxml import etree headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ret=requests.get(url=url,headers=headers) ret.encoding="utf-8" tests=ret.text tree=etree.HTML(tests) div_list=tree.xpath("//div[@id='container']/div") for div in div_list: a_url = div.xpath(".//a/@href") print(a_url) img_url=div.xpath(".//img/@src2") print(img_url) img_name= div.xpath(".//img/@alt") print(img_name)
import requests
import time
from lxml import etree   # XPath parsing

# Folder where the sc.chinaz.com images are stored; it must already exist
IMAGE_PATH = 'img/'

def spider_image(page):
    if page == 1:
        url = 'http://sc.chinaz.com/tupian/'
    else:
        url = 'http://sc.chinaz.com/tupian/index_%s.html' % page
    # Custom request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/74.0.3729.108 Safari/537.36",
    }
    # Request the page
    response = requests.get(url=url, headers=headers)
    # The response has to be re-encoded, otherwise the text comes back garbled
    response.encoding = 'utf-8'
    # Parse the page and collect the img tags
    tree = etree.HTML(response.text)
    img_list = tree.xpath('//div[@id="container"]/div/div/a/img')
    # Loop over every img tag
    for img in img_list:
        # image title
        title = img.xpath('./@alt')[0]
        # The src attribute only holds the lazy-load placeholder: a plain GET returns the page source
        # before any JavaScript has run, so the real URL is still in src2.
        src = img.xpath('./@src2')[0]
        # Request the image itself
        res = requests.get(url=src, headers=headers)
        # Images must be written as a binary stream
        with open(IMAGE_PATH + '%s.jpg' % title, 'wb') as f:
            f.write(res.content)

if __name__ == '__main__':
    # Scrape the requested number of pages
    start_time = time.time()
    for i in range(1, 3):
        spider_image(i)
        time.sleep(2)
    end_time = time.time()
    print("Total time: %s" % (end_time - start_time))
