python爬取照片（失敗）

本文轉載自查看原文 2020-10-11 12:55 623 python

　　python爬取漫畫（失敗）

一：獲取每一章的url網址以及名字：

import re
import urllib
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import time

# Base URL of the site; chapter hrefs found on the index page are appended to it.
main_web = "http://www.kuman55.com"

pic = []  # stores pictures
findlist = []  # raw HTML text of each chapter anchor, filled by collect()
strname = []  # stores the name of each comic chapter
addr = []  # stores the address of each chapter page
# Captures the href and the title text of a chapter anchor.
findTitle = re.compile(r'<a href="(.*)" rel="nofollow">(.*)<span>')
findSource = re.compile(r'<a href=".*')

def collect(web):
    """Collect the URL and anchor markup of every chapter on an index page.

    Fetches ``web``, locates the element whose class is
    ``view-win-list detail-list-select`` and, for every ``<a rel="nofollow">``
    inside it, appends the absolute chapter URL to ``addr`` and the anchor's
    HTML text to ``findlist``. ``save_info()`` is then called to process the
    collected entries.

    Args:
        web: URL of the chapter index page.

    Raises:
        ValueError: If the chapter list element is not found in the page.
    """
    # The context manager closes the HTTP response once the body is parsed.
    with urllib.request.urlopen(web) as response:
        # time.sleep() takes seconds: pause briefly before reading the body.
        time.sleep(2)
        bs = BeautifulSoup(response.read(), "html")

    tag = bs.find(attrs={'class': 'view-win-list detail-list-select'})
    if tag is None:
        raise ValueError("chapter list element not found in page: %s" % web)

    for item in tag.find_all(name='a', rel='nofollow'):
        # Indexing a tag with an attribute name yields that attribute's value;
        # the href is appended to the site root to form the chapter URL.
        addr.append(main_web + item['href'])
        # Keep the anchor's markup so the chapter name can be extracted later.
        findlist.append(str(item))

    save_info()

def save_info():
    """Extract chapter names from ``findlist`` and hand each chapter to ``save_picture``.

    Each entry of ``findlist`` is anchor markup; the opening
    ``<a href="..." rel="nofollow">`` tag and the literal trailing
    ``<span>（P）</span></a>`` are stripped and the remainder is stored in
    ``strname``. ``save_picture(name, address)`` is then called once per
    chapter, pairing ``strname`` with ``addr`` by position.
    """
    names = []
    for item in findlist:
        # The patterns must match the anchor markup exactly as it was stored.
        item = re.sub(r'<a href=".*" rel="nofollow">', "", item)
        item = re.sub(r'<span>（P）</span></a>', "", item)
        names.append(item)

    # Rebuild in place so strname stays index-aligned with addr even when
    # this function runs more than once.
    strname[:] = names

    for name, address in zip(strname, addr):
        save_picture(name, address)

def save_picture(name, address):
    """Placeholder for the picture download step.

    Both arguments are currently ignored; the function only writes an empty
    line to standard output.
    """
    print()




# When executed as a script, run the crawler against the chapter index page.
if __name__ == "__main__":
    collect("http://www.kuman55.com/mulu/15762/1-1.html")

# See PyCharm help at https://www.jetbrains.com/help/pycharm/

二：儲存漫畫圖片（這里失敗了，因為該網站使用Ajax動態隱藏掉了圖片div標簽，而且圖片是以Data URI（Base64編碼，並非加密）的形式內嵌，目前還沒有成功解碼出正確的圖片）

import re
import urllib
import urllib.request
import time
import requests
#進行Data URI編碼所用的包
from base64 import b64decode
from bs4 import BeautifulSoup

# Request headers carrying a desktop browser User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.121 Safari/537.36 Edg/85.0.564.68"
    ),
}

# Captures whatever follows the image host prefix inside a src attribute.
findsrc = re.compile(r'src="https://p.pstatp.com/origin(.*)"')
img, strings = [], []

def save_picture(name,address):
    """Fetch a chapter page and save its embedded comic image to ``image_src.png``.

    The page at ``address`` is requested with the module-level ``headers``,
    the ``src`` of the ``<img class="comicimg">`` element is read, and the
    part after the first comma of that Data URI is Base64-decoded and written
    to ``image_src.png`` in the current directory.

    Known issue: the image produced this way has been observed to be blank;
    the cause is unresolved.

    Args:
        name: Chapter name. Currently unused; the output file name is fixed.
        address: URL of the chapter page.

    Raises:
        ValueError: If the page has no ``comicimg`` image, or its ``src`` is
            not a comma-separated Data URI, or the payload is not valid Base64.
    """
    # NOTE(review): the request is sent as POST with no body; confirm the
    # site requires this rather than a plain GET.
    request = urllib.request.Request(url=address, headers=headers, method="POST")
    time.sleep(2)
    # The context manager closes the HTTP response once the body is parsed.
    with urllib.request.urlopen(request) as res:
        bs = BeautifulSoup(res.read(), "html")

    # Tag.get("<attribute>") returns the value of that attribute on the tag.
    img_tag = bs.find(name="img", attrs={"class": "comicimg"})
    if img_tag is None:
        raise ValueError("no <img class=\"comicimg\"> found in page: %s" % address)
    src = str(img_tag.get('src'))
    print(src)

    # Decoding a Base64 Data URI:
    #   1. base64 is part of the standard library, no pip install is needed.
    #   2. The src consists of an encoding header and the payload, joined by
    #      exactly one comma; split them apart.
    #   3. Decode the payload with b64decode into a bytes object.
    #   4. Write those bytes to a file to produce the image.
    if ',' not in src:
        raise ValueError("image src is not a Data URI: %r" % src[:60])
    _, encoded = src.split(',', 1)
    data = b64decode(encoded)
    with open("image_src.png", "wb") as f:
        f.write(data)

    # Alternative approach that was considered:
    #   1. import decodestring from base64 (removed in Python 3.9;
    #      base64.decodebytes is the replacement)
    #   2. open the image file
    #   3. encode the payload first, then convert it to a string





# When executed as a script, try the download on a single chapter page.
if __name__ == "__main__":
    save_picture("照一", "http://www.kuman55.com/15762/1171599.html")

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python基礎之爬取小說 python批量爬取文檔 python爬取簡單網頁 python 爬取知乎圖片 python爬蟲（爬取視頻） python爬蟲之爬取小說（一） python爬蟲之爬取音頻 [Python]爬取mzitu網站 Python爬取地圖瓦片用python爬取疫情數據