I. Case studies
1. Scraping data with requests (get, post) and urllib
https://www.cnblogs.com/lovershowtime/p/11771338.html
Simple scrape of the Sogou home page, saved locally
import requests

ret = requests.get(url="https://www.sogou.com")
aa = ret.text
print(ret.text)
with open("aa.html", "w", encoding="utf-8") as f:
    f.write(aa)
GET: scrape Sogou for a user-supplied keyword and write the result locally
import requests

wd = input("Enter a search keyword: ")
param = {
    'query': wd
}
ret = requests.get(url="https://www.sogou.com", params=param)   # params: the dynamic data packed into a dict
print(ret)
# aa = ret.text       # returns a string
aa = ret.content      # returns binary data
with open("bb.html", "wb") as f:
    f.write(aa)
print("Done")
POST: scrape Baidu Translate suggestion data
import requests

wd = input("Enter a word: ")
data = {
    'query': wd    # note: the /sug endpoint actually expects the key 'kw'; with 'query' it returns errno 1001, as shown below
}
ret = requests.post(url="https://fanyi.baidu.com/sug", data=data)
print(ret.content)   # b'{"errno":1001,"errmsg":"\\u53c2\\u6570\\u9519\\u8bef"}'  -- bytes
print(ret.text)      # {"errno":1001,"errmsg":"\u53c2\u6570\u9519\u8bef"}  -- string
print(ret.json())    # {'errno': 1001, 'errmsg': 'parameter error'}  -- dict; the response body must be JSON, otherwise this raises an error
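Since ret.json() raises an exception whenever the response body is not valid JSON, it can help to wrap the call in a small guard. A minimal sketch (the form key 'kw' used here is an assumption about the /sug endpoint, not something shown above):

import requests

ret = requests.post(url="https://fanyi.baidu.com/sug", data={'kw': 'dog'})   # 'kw' is an assumed parameter name
try:
    print(ret.json())        # parsed dict when the body is valid JSON
except ValueError:           # requests raises a ValueError subclass when the body is not JSON
    print("Response was not JSON:", ret.text[:200])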
GET: scrape Douban movie rankings
# https://movie.douban.com/j/chart/top_list?type=20&interval_id=100%3A90&action=&start=140
import requests

date = {
    'type': '5',
    'interval_id': '100:90',
    'action': '',
    'start': '1',
    'limit': '23'
}
res = requests.get(url="https://movie.douban.com/j/chart/top_list?", params=date)
print(res.json())
POST: scrape KFC store locations
import requests

keyword = input("Enter a city: ")
date = {
    'cname': '',
    'pid': '',
    'keyword': keyword,
    'pageIndex': '1',
    'pageSize': '10'
}
res = requests.post(url="http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword", data=date)
print(res.url)
print(res.json())
POST: scrape cosmetics production licence details (dynamic data)
# Dynamic (AJAX) data; spoof the browser User-Agent to get past the anti-scraping check.  http://125.35.6.84:81/xk/
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
aa = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
id_list = []
for page in range(1, 11):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '5',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': ''
    }
    res = requests.post(url=aa, data=data, headers=headers).json()
    # print(res)
    # print(res["list"])
    for dic in res["list"]:
        id = dic["ID"]
        id_list.append(id)
print(id_list)

id_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
for li_id in id_list:
    id_data = {
        'id': li_id
    }
    ret = requests.post(url=id_url, data=id_data, headers=headers).json()
    print(ret)
GET: download an image and write it to a local file
import requests
import urllib.request

aa = 'http://d.hiphotos.baidu.com/album/pic/item/b58f8c5494eef01f8931cc7ae1fe9925bc317d6c.jpg?psign=8931cc7ae1fe9925bc315c6034a85edf8cb1cb1349545954'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
res = requests.get(url=aa, headers=headers).content
with open("./aa.jpg", "wb") as f:
    f.write(res)

urllib.request.urlretrieve(url=aa, filename="./11.jpg")   # does the same as the requests version above
Scrape Chouti: log in and upvote (GET + POST)
# Step 1: visit the home page. Step 2: submit the username and password.
# Normally you would just send the login request and read the result. Chouti's trick: the very first request to any page
# already returns a set of cookies, and the login request must carry those cookies to be authorized.
import requests

# 1. Visit the home page
r1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
)
# 2. Submit the username and password
r2 = requests.post(
    url='https://dig.chouti.com/login',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    },
    data={                      # payload sent with the login request
        'phone': '8617380117935',
        'password': 'lv5555555',
        'oneMonth': 1
    },
    cookies=r1.cookies.get_dict()
)
print(r2.text)
print(r2.cookies.get_dict())    # these cookies are a decoy; the ones from r1 are what matter
# 3. Upvote
r3 = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=20435396',   # upvote URL
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    },
    cookies=r1.cookies.get_dict()   # carry r1's cookies
)
print(r3.text)
print(r1.cookies.get_dict())
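The same three steps can also be written with requests.Session(), which stores the cookies from the first request and re-sends them automatically, so they do not have to be passed around by hand. A minimal sketch reusing the URLs and payload above:

import requests

s = requests.Session()
ua = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

s.get('https://dig.chouti.com/', headers=ua)                      # 1. home page: the session keeps the cookies
s.post('https://dig.chouti.com/login', headers=ua,                # 2. login: the stored cookies are sent automatically
       data={'phone': '8617380117935', 'password': 'lv5555555', 'oneMonth': 1})
r = s.post('https://dig.chouti.com/link/vote?linksId=20435396', headers=ua)   # 3. upvote
print(r.text)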
2. Scraping data with requests and regular expressions
https://www.cnblogs.com/lovershowtime/p/11776549.html
Regex: scrape and download images from Qiushibaike
# https://www.qiushibaike.com/
import os
import re
import urllib.request

import requests

url = "https://www.qiushibaike.com/pic/page/%d/?s=5170552"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
if not os.path.exists("./img"):
    os.mkdir("./img")
start = int(input("Start page: "))
end = int(input("End page: "))
for page in range(start, end + 1):
    new_url = format(url % page)   # e.g. https://www.qiushibaike.com/pic/page/1/?s=5170552
    print(new_url)
    page_text = requests.get(url=new_url, headers=headers).text
    img_url_list = re.findall('<div class="thumb">.*?<img src="(.*?)" alt=.*?></div>', page_text, re.S)
    for img_url in img_url_list:
        img_urls = 'https:' + img_url          # the src in the page is protocol-relative (//...)
        imgname = img_urls.split("/")[-1]
        imgpath = "img/" + imgname
        urllib.request.urlretrieve(url=img_urls, filename=imgpath)
        print("Downloaded", imgname)

# re.findall(pattern, string)
# .  matches any character
# *  zero or more of the preceding character
# ?  makes the match non-greedy
# re.S  lets . also match newlines

aa = "http://img95.699pic.com/photo/50045/7601.jpg_wh300.jpg"
print(aa.split("/"))
print(aa.split("/")[-1])   # 7601.jpg_wh300.jpg
3. Scraping data with requests and BeautifulSoup
https://www.cnblogs.com/lovershowtime/p/11771726.html
Scrape the classical-literature site (shicimingju.com) and write the chapters to a local file
import requests
from bs4 import BeautifulSoup

url = "http://www.shicimingju.com/book/sanguoyanyi.html"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
pang_text = requests.get(url=url, headers=headers).text
sup = BeautifulSoup(pang_text, "lxml")
list_li = sup.select('.book-mulu>ul>li>a')   # chapter links in the table of contents
fp = open("aa.txt", "w", encoding="utf-8")
for a in list_li:
    title = a.string
    print(title)
    urls_text = 'http://www.shicimingju.com' + a["href"]
    print(urls_text)   # e.g. http://www.shicimingju.com/book/nanbeishiyanyi/10.html
    pa_test = requests.get(url=urls_text, headers=headers).text
    sup = BeautifulSoup(pa_test, "lxml")
    cont = sup.find('div', class_='chapter_content').text
    fp.write(title + '\n' + cont)
    print(title)
fp.close()
Scrape Autohome news and download the images locally
import requests
from bs4 import BeautifulSoup   # parses an HTML string into an object; use .find / .find_all on it

response = requests.get("https://www.autohome.com.cn/news/")
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, 'html.parser')   # parse
div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})   # this div contains all the news items
print(div)
li_list = div.find_all(name='li')
for li in li_list:
    title = li.find(name='h3')   # headline
    if not title:
        continue
    p = li.find(name='p')
    a = li.find(name='a')
    print(title.text)
    print(a.attrs.get('href'))   # read an attribute
    print(p.text)
    # grab the image
    img = li.find(name='img')
    src = img.get('src')
    src = "https:" + src
    print(src)
    # send another request to download the image
    file_name = src.rsplit('/', maxsplit=1)[1]
    ret = requests.get(src)
    with open(file_name, 'wb') as f:
        f.write(ret.content)
Scrape Chouti post titles
import requests
from bs4 import BeautifulSoup

r1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
)
soup = BeautifulSoup(r1.text, 'html.parser')   # tag object
content_list = soup.find(name='div', attrs={"class": "link-con"})
# print(content_list)
item_list = content_list.find_all(name='div', attrs={'class': 'link-detail'})   # [tag, tag, ...]
for item in item_list:
    a = item.find(name='a', attrs={'class': 'link-title link-statistics'})
    print(a.text.strip())
    print(a["href"])
4. Scraping data with requests and XPath
https://www.cnblogs.com/lovershowtime/p/11777009.html
Use XPath to scrape second-hand housing listings
import requests from lxml import etree url="https://cd.58.com/ershoufang/?utm_source=sem-sales-baidu-pc&spm=82881519251.21430224112&utm_campaign=sell&utm_medium=cpc&showpjs=pc_fg" headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } f=open("ab.txt","w",encoding="utf-8") ret=requests.get(url=url,headers=headers).text tree=etree.HTML(ret) list_le=tree.xpath("//ul[@class='house-list-wrap']/li") print(list_le) for el in list_le: title=el.xpath("./div[2]/h2/a/text()")[0] # 當前第二個div下的h2 下的a的文本 price = el.xpath("./div[3]//text()") # 當前第三個div下的 所有的文本 pi=''.join(price) f.write(title+":"+pi+"\n") f.close() # /html/body/div[5]/div[5]/div[1]/ul/li[1]/div[2] # # /html/body/div[5]/div[5]/div[1]/ul/li[1]/div[2]/h2
Use XPath to scrape images and download them locally
# http://pic.netbian.com/4kmeinv/
import os
import urllib.request

import requests
from lxml import etree

url = "http://pic.netbian.com/4kmeinv/"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
if not os.path.exists("./imgs"):
    os.mkdir("./imgs")
ret = requests.get(url=url, headers=headers)
# ret.encoding = "utf-8"   # the usual fix for garbled text (does not help here)
ret_li = etree.HTML(ret.text)
li_list = ret_li.xpath("//div[@class='slist']/ul/li")
for li in li_list:
    li_name = li.xpath("./a/b/text()")[0]
    # Fix the mojibake: re-encode, then decode with the page's real encoding (gbk).
    # This trick works wherever the garbling appears; it can also be applied to the whole response.
    li_img_name = li_name.encode('ISO-8859-1').decode("gbk")
    img_url = "http://pic.netbian.com" + li.xpath("./a/img/@src")[0]
    img_path = './imgs/' + li_img_name + '.jpg'
    urllib.request.urlretrieve(url=img_url, filename=img_path)
    print(img_path, "downloaded")
# Scraping images whose real URLs are obfuscated (an anti-scraping measure):
# the page stores a base64-encoded hash instead of the image URL; base64.b64decode returns bytes.
import base64
import urllib.request

import requests
from lxml import etree

url = "http://jandan.net/ooxx"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
ret = requests.get(url=url, headers=headers).text
print(ret)
ret_li = etree.HTML(ret)
li_list = ret_li.xpath("//span[@class='img_hash']/text()")
for img_hash in li_list:
    img_url = "http:" + base64.b64decode(img_hash).decode()   # decode the obfuscated image path
    img_name = img_url.split("/")[-1]
    urllib.request.urlretrieve(url=img_url, filename=img_name)
Scrape and download résumé templates (sc.chinaz.com)

import random
import urllib.request

import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = "http://sc.chinaz.com/jianli/free_%d.html"   # page-number template
for page in range(1, 4):
    if page == 1:
        new_url = "http://sc.chinaz.com/jianli/free.html"   # free templates, first page
    else:
        new_url = format(url % page)
    ret = requests.get(url=new_url, headers=headers)
    ret.encoding = 'utf-8'   # fix garbled text
    tree = etree.HTML(ret.text)
    div_list = tree.xpath("//div[@id='container']/div")
    for div in div_list:
        det_url = div.xpath("./a/@href")[0]     # link to the template's detail page
        name = div.xpath("./a/img/@alt")[0]     # template name from the image's alt text
        print(name)
        dat_page = requests.get(url=det_url, headers=headers).text
        trees = etree.HTML(dat_page)
        dowloand_list = trees.xpath("//div[@class='clearfix mt20 downlist']/ul/li/a/@href")
        dow_url = random.choice(dowloand_list)   # pick one of the mirror download links
        data = requests.get(url=dow_url, headers=headers).content
        fileName = name + ".rar"
        with open(fileName, "wb") as f:
            f.write(data)
        print(fileName, "downloaded")

Problem: when sending a large number of requests you will often see an error like HTTPConnectionPool(...): Max retries exceeded with url.
Causes:
1. Before each transfer the client opens a TCP connection to the server. To save time the connection is kept alive by default (keep-alive); if connections are never closed, the connection pool eventually fills up, no new connection object can be created, and requests can no longer be sent.
2. The IP has been blocked.
3. Requests are sent too frequently.
Fixes: set the Connection header to close so the connection is dropped after each successful request; switch IPs; sleep between requests (see the sketch below).
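A minimal sketch of the fixes just listed (Connection: close plus a pause between requests); the URL list here is only a placeholder:

import time
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Connection': 'close',        # drop the connection after every request instead of keeping it alive
}
urls = ["http://sc.chinaz.com/jianli/free.html"]   # placeholder list of pages to fetch
for u in urls:
    try:
        resp = requests.get(u, headers=headers, timeout=10)
        print(u, resp.status_code)
    except requests.exceptions.RequestException as err:
        print("request failed:", err)
    time.sleep(2)                 # wait between requests to avoid hammering the server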
Scrape the list of cities from the air-quality history site
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = "https://www.aqistudy.cn/historydata/"
ret = requests.get(url=url, headers=headers).text
tree = etree.HTML(ret)
# two XPath expressions combined with |
li_lit = tree.xpath("//div[@class='bottom']/ul/li | //div[@class='bottom']/ul/div[2]/li")
for li in li_lit:
    cont = li.xpath("./a/text()")[0]
    print(cont)
5. Using proxy IPs (single proxies and proxy pools)
Proxies and proxy pools

# Proxy-IP sources: http://www.goubanjia.com/ (全網代理IP)   https://www.kuaidaili.com/ (快代理)
# Anti-scraping countermeasure: route the request through a proxy IP.
# The proxy's protocol type must match the protocol of the requested URL (http vs https).

Method 1: set the proxy directly in requests

import requests

url = "https://www.baidu.com/s?wd=ip"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
ret = requests.get(url=url, headers=headers, proxies={"https": '222.184.59.8:808'}).text
print(ret)
with open("./ip.html", "w", encoding="utf-8") as f:
    f.write(ret)
Method 2: set the proxy with the urllib module

import urllib.request

ip = '119.23.79.199:3128'
proxy = urllib.request.ProxyHandler({"http": ip})
openers = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(openers)
url = "http://www.baidu.com"
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
print(len(data))
f = open("bb.html", 'w', encoding="utf-8")
f.write(data)
f.close()
Method 1: build a proxy pool (suitable when the proxy IPs are stable)

import random
import urllib.request

# proxy pool
pools = [
    "119.23.79.199:3128",
    "221.224.163.54:808",
    "210.26.64.44:3128",
    "27.191.234.69:9999",
]

def ip(pools):
    ips = random.choice(pools)
    proxy = urllib.request.ProxyHandler({"http": ips})
    openers = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(openers)

for i in range(0, 5):
    try:   # the pooled IPs are unstable, so catch exceptions
        ip(pools)
        url = "http://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("gbk", "ignore")
        print(len(data))
        f = open("ss.html", "w")
        f.write(data)
    except Exception as err:
        print(err)
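For comparison, the same pick-a-random-proxy idea written with requests, which the rest of these notes use. A minimal sketch with the same sample IPs (they are unlikely to still be reachable):

import random
import requests

pools = [
    "119.23.79.199:3128",
    "221.224.163.54:808",
    "210.26.64.44:3128",
]
for i in range(5):
    proxy = {"http": "http://" + random.choice(pools)}   # pick a random proxy for each attempt
    try:
        r = requests.get("http://www.baidu.com", proxies=proxy, timeout=5)
        print(len(r.text))
    except requests.exceptions.RequestException as err:  # unstable proxies fail often, so catch and continue
        print(err)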
Method 2: proxy pool built by calling a provider's API (suitable when the fetched proxy IPs are stable)
import urllib.request

def ip():
    # Daxiang proxy API: http://daxiangdaili.com/api
    ips = urllib.request.urlopen("http://www.daxiangdaili.com/ip/?tid=559126871522587&num=2").read().decode("utf-8", "ignore")
    proxy = urllib.request.ProxyHandler({"http": ips})
    openers = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(openers)

for i in range(0, 5):
    try:   # the pooled IPs are unstable, so catch exceptions
        ip()
        url = "http://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("gbk", "ignore")
        print(len(data))
        f = open("ss.html", "w")
        f.write(data)
    except Exception as err:
        print(err)
Method 3: scrape a proxy-list site and build the pool from it
from bs4 import BeautifulSoup
import requests
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

# 2. Fetch a page's content
def getHTMLText(url, proxies):
    try:
        r = requests.get(url, proxies=proxies)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except:
        return 0
    else:
        return r.text

# 3. Get proxy IPs from the proxy-list site, check that they work, and return the list
def get_ip_list(url):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'html.parser')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    # Check availability and keep only working IPs. (The check is imperfect: an IP that fails now may
    # only be temporarily down, and one that passes may stop working after a single use.)
    checked = []
    for ip in ip_list:
        try:
            proxy_temp = {"http": "http://" + ip, "https": "https://" + ip}
            requests.get(url, proxies=proxy_temp, timeout=5)
            checked.append(ip)
        except Exception:
            continue
    return checked

# 4. Pick a random IP from the pool and return it in requests' proxies format
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies

# 5. Use the proxy
if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    ip_list = get_ip_list(url)
    proxies = get_random_ip(ip_list)
    print(proxies)
6. Scraping data with a simulated login
Renren simulated login: packet capture, captcha recognition (via the YunDaMa platform, http://www.yundama.com/), and session usage
# http://www.yundama.com/  YunDaMa (captcha-recognition platform)
# Superme888888@outlook.com
# supreme9999
# @_XJQ1995110
# 17380117935
# http://www.renren.com/SysHome.do
import json
import time
import urllib.request

import requests
from lxml import etree

session = requests.session()   # session object: it will hold the login cookies for us

######################################################################
# YunDaMa captcha-recognition client (the platform's sample code)
class YDMHttp:
    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def request(self, fields, files=[]):
        response = self.post_url(self.apiurl, fields, files)
        response = json.loads(response)
        return response

    def balance(self):
        data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['balance']
        else:
            return -9001

    def login(self):
        data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
        response = self.request(data)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['uid']
        else:
            return -9001

    def upload(self, filename, codetype, timeout):
        data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
        file = {'file': filename}
        response = self.request(data, file)
        if (response):
            if (response['ret'] and response['ret'] < 0):
                return response['ret']
            else:
                return response['cid']
        else:
            return -9001

    def result(self, cid):
        data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
        response = self.request(data)
        return response and response['text'] or ''

    def decode(self, filename, codetype, timeout):
        cid = self.upload(filename, codetype, timeout)
        if (cid > 0):
            for i in range(0, timeout):
                result = self.result(cid)
                if (result != ''):
                    return cid, result
                else:
                    time.sleep(1)
            return -3003, ''
        else:
            return cid, ''

    def report(self, cid):
        data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
        response = self.request(data)
        if (response):
            return response['ret']
        else:
            return -9001

    def post_url(self, url, fields, files=[]):
        for key in files:
            files[key] = open(files[key], 'rb')
        res = requests.post(url, files=files, data=fields)
        return res.text


def getCode(username, pwd, codePath, codeType):
    # Ordinary (non-developer) account credentials
    username = username
    password = pwd
    # Software ID and key: get them from the developer console ("My Software")
    appid = 9406
    appkey = '4b671243618fff6a87ebbe33446d09e3'
    # Captcha image file
    filename = codePath
    # Captcha type, e.g. 1004 = 4 alphanumeric characters; see http://www.yundama.com/price.html for all types
    codetype = codeType
    # Timeout in seconds
    timeout = 80
    result = None
    if (username == 'username'):
        print('Please fill in the parameters before testing')
    else:
        yundama = YDMHttp(username, password, appid, appkey)
        uid = yundama.login(); print('uid: %s' % uid)                 # log in to YunDaMa
        balance = yundama.balance(); print('balance: %s' % balance)   # check the balance
        cid, result = yundama.decode(filename, codetype, timeout)     # recognise the captcha
        print('cid: %s, result: %s' % (cid, result))
    return result


# Simulated login
url = "http://www.renren.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
ret = requests.get(url=url, headers=headers).text
terr = etree.HTML(ret)
code_img_url = terr.xpath("//*[@id='verifyPic_login']/@src")[0]   # captcha image URL
urllib.request.urlretrieve(url=code_img_url, filename="code.jpg")
# Recognise the captcha
code_data = getCode("supreme9999", "@_XJQ1995110", "./code.jpg", 2004)
print(code_data)
# Payload captured from the login request
login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019100151870"
data = {
    "email": "17380117935",
    "icode": code_data,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": 1,
    "captcha_type": "web_login",
    "password": "7f68692e5e69afa1ba418b799ec63a0a",
    "rkey": "7f68692e5e69afa1ba418b799ec63a0a",
    "f": "http%3A%2F%2Fwww.renren.com%2F972764841%2Fprofile",
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# With plain requests.post the cookies produced by the login are not stored anywhere:
# get_cont = requests.post(url=login_url, data=data, headers=headers)
# With the session object, the cookies from a successful login are stored automatically
# (note they are only produced once the login succeeds).
session.post(url=login_url, data=data, headers=headers)
urls = "http://www.renren.com/972764841/profile"   # profile page reached after a successful login
pag_text = session.get(url=urls, headers=headers).text
with open("ren.html", "w", encoding="utf-8") as f:
    f.write(pag_text)
Simulated login to the Gushiwen site (so.gushiwen.org)
# YunDaMa account and site credentials: same as listed in the Renren example above.
import requests
from lxml import etree

# The YDMHttp class and the getCode() helper are identical to the ones in the Renren example above,
# so they are not repeated here.

# Fetch the captcha from the Gushiwen login page
s = requests.Session()
url = "https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
ret = requests.get(url=url, headers=headers).text
terr = etree.HTML(ret)
img_src = 'https://so.gushiwen.org' + terr.xpath("//*[@id='imgCode']/@src")[0]
print(img_src)
img_data = s.get(url=img_src, headers=headers).content   # fetch the captcha through the session so the cookie matches the image
with open("./cc.jpg", "wb") as f:
    f.write(img_data)
img_text = getCode("Superme888888@outlook.com", "@_XJQ1995110", "./cc.jpg", 1004)
print(img_text)

# Simulated login
url = "https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx"
date = {
    "__VIEWSTATE": "DmBrtFoRGX4MZ4I+urA1bNT3UpnZRkyA7O/9XO1azxff3G35mKDbCmAunAB+TZAZF6HpQunWGe82fhPXwgs/DVfRY9h/LBljRx97fxgOE7+AkMu12yNZsyIZs1I=",   # hidden form field; read it from the page source
    "__VIEWSTATEGENERATOR": "C93BE1AE",
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "Superme888888@outlook.com",
    "pwd": "@_XJQ1995110",
    "code": img_text,   # the recognised captcha text
    "denglu": "登錄",
}
tesrs = s.post(url=url, headers=headers, data=date).text
with open("./aa.html", "w", encoding="utf-8") as f:
    f.write(tesrs)
7. Scraping images (lazy loading)
# In the page source, the img tag first stores the real image URL in a placeholder attribute (commonly src2, original, ...)
# rather than directly in src. When the image scrolls into the visible area, the page dynamically swaps the placeholder
# into the src attribute; that is how lazy loading works.
url="http://sc.chinaz.com/tupian/xixirenti.html" import requests from lxml import etree headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ret=requests.get(url=url,headers=headers) ret.encoding="utf-8" tests=ret.text tree=etree.HTML(tests) div_list=tree.xpath("//div[@id='container']/div") for div in div_list: img_url=div.xpath(".//img/@src") print(img_url) img_name= div.xpath(".//img/@alt") print(img_name) # 爬取到的圖片為空
url="http://sc.chinaz.com/tupian/xixirenti.html" import requests from lxml import etree headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ret=requests.get(url=url,headers=headers) ret.encoding="utf-8" tests=ret.text tree=etree.HTML(tests) div_list=tree.xpath("//div[@id='container']/div") for div in div_list: a_url = div.xpath(".//a/@href") print(a_url) img_url=div.xpath(".//img/@src2") print(img_url) img_name= div.xpath(".//img/@alt") print(img_name)
import requests
import time
from lxml import etree   # XPath parsing

# Folder where the sc.chinaz.com images are stored; it must already exist
IMAGE_PATH = 'img/'

def spider_image(page):
    if page == 1:
        url = 'http://sc.chinaz.com/tupian/'
    else:
        url = 'http://sc.chinaz.com/tupian/index_%s.html' % page
    # Custom request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/74.0.3729.108 Safari/537.36",
    }
    # Request the page
    response = requests.get(url=url, headers=headers)
    # The response has to be re-encoded, otherwise the text comes back garbled
    response.encoding = 'utf-8'
    # Parse the page and collect the img tags
    tree = etree.HTML(response.text)
    img_list = tree.xpath('//div[@id="container"]/div/div/a/img')
    # Loop over every img tag
    for img in img_list:
        # image title
        title = img.xpath('./@alt')[0]
        # The src attribute only holds the lazy-load placeholder: a plain GET returns the page source
        # before any JavaScript has run, so the real URL is still in src2.
        src = img.xpath('./@src2')[0]
        # Request the image itself
        res = requests.get(url=src, headers=headers)
        # Images must be written as a binary stream
        with open(IMAGE_PATH + '%s.jpg' % title, 'wb') as f:
            f.write(res.content)

if __name__ == '__main__':
    # Scrape the requested number of pages
    start_time = time.time()
    for i in range(1, 3):
        spider_image(i)
        time.sleep(2)
    end_time = time.time()
    print("Total time: %s" % (end_time - start_time))
