python爬蟲(爬取圖片)


python爬蟲爬圖片

第一步

載入爬蟲模塊

from requests_html import HTMLSession            #載入爬蟲模塊

第二步

創建session對象

from requests_html import HTMLSession            #載入爬蟲模塊
session =HTMLSession() #the session object is now created

第三步

找出百度圖片搜索的URL規律,發起請求,並匹配到圖片的url

http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=我們搜圖片的關鍵字

from requests_html import HTMLSession            #載入爬蟲模塊
# Build the session object that will issue the requests.
session = HTMLSession()
# Request the search-results page, using the keyword "二傻子" as the example.
response = session.get(
    'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子'
)
# Pattern handed to search_all() to locate each thumbnail URL in the page
# source; the `{}` placeholder marks the URL text to capture.
img_url_regex = '"thumbURL":"{}",'
# Collect every occurrence of the pattern found in the returned page.
img_url_list = response.html.search_all(img_url_regex)

第四步

訪問圖片url並且保存下來

from requests_html import HTMLSession            #載入爬蟲模塊
# Build the session object that will issue the requests.
session = HTMLSession()
# Request the search-results page, using the keyword "二傻子" as the example.
response = session.get('http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子')
# Pattern handed to search_all() to locate each thumbnail URL in the page
# source; the `{}` placeholder marks the URL text to capture.
img_url_regex = '"thumbURL":"{}",'
# Collect every occurrence of the pattern found in the returned page.
img_url_list = response.html.search_all(img_url_regex)

# enumerate() numbers the images from 1, replacing the hand-maintained counter.
for mun, url in enumerate(img_url_list, start=1):
    # Each match carries the captured URL at index 0. A separate name is used
    # so the search-page `response` above is not overwritten by the loop.
    img_response = session.get(url[0])
    if not img_response.ok:
        # Do not store an HTTP error page on disk under a .jpg name.
        continue
    # Write the raw bytes of the image to a numbered local file.
    with open(f'第{mun}張.jpg', 'wb') as fw:
        fw.write(img_response.content)

第五步

類的封裝

from requests_html import HTMLSession    

class BaiDuImg:
    """Interactive downloader for Baidu image-search thumbnails.

    Typical use is ``BaiDuImg().run()``: it asks for a keyword, fetches the
    search-results page, extracts every thumbnail URL from the page source
    and saves each image to the current directory as ``第N張.jpg``.
    """

    # One HTTP session shared by every request made through this class.
    session = HTMLSession()
    # Pattern handed to search_all(); the `{}` placeholder captures the URL.
    img_url_regex = '"thumbURL":"{}",'
    # Search-results URL; filled in by get_search().
    url = ''
    # Matches produced by get_img_url_list(). The methods rebind this
    # attribute on the instance and never mutate the class-level list.
    img_url_list = []

    def get_search(self):
        """Prompt for a keyword on stdin and store the search URL in ``self.url``."""
        # Function-scope import so this class does not depend on a
        # file-level import being present.
        from urllib.parse import quote

        search = input('請輸入你要搜索的圖片')
        # Percent-encode the keyword: characters such as '&', '#' or spaces
        # would otherwise split or truncate the query string.
        keyword = quote(search, safe='')
        self.url = f'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word={keyword}'

    def get_img_url_list(self):
        """Fetch ``self.url`` and store all pattern matches in ``self.img_url_list``."""
        response = self.session.get(self.url)
        self.img_url_list = response.html.search_all(self.img_url_regex)

    def save_img(self):
        """Download every matched image and write it to ``第N張.jpg`` (N starts at 1)."""
        for mun, url in enumerate(self.img_url_list, start=1):
            # Each match carries the captured URL at index 0.
            response = self.session.get(url[0])
            if not response.ok:
                # Do not store an HTTP error page on disk under a .jpg name.
                continue
            # Write the raw bytes of the image to a numbered local file.
            with open(f'第{mun}張.jpg', 'wb') as fw:
                fw.write(response.content)

    def run(self):
        """Run the whole pipeline: prompt, collect image URLs, download."""
        self.get_search()
        self.get_img_url_list()
        self.save_img()
        
if __name__ == '__main__':
    # Start the interactive crawler only when the file is run as a script.
    BaiDuImg().run()

後來有個研一的小姐姐說要把全部圖片都爬完,那就改改

from requests_html import HTMLSession

class BaiDuImg:
    """Interactive downloader that pages through Baidu's image-search JSON endpoint.

    Typical use is ``BaiDuImg().run()``: it asks for a keyword, requests
    result pages until the endpoint signals the end, and saves every
    thumbnail to the current directory as ``第N張.jpg``.
    """

    # One HTTP session shared by every request made through this class.
    session = HTMLSession()
    # Not used by this version's methods; kept so the attribute stays available.
    img_url_regex = '"thumbURL":"{}",'
    # Base request URL without the paging offset; filled in by get_search().
    url = ''
    # Class-level default only. get_img_url_list() binds a fresh list on the
    # instance, so this shared list is never mutated.
    img_url_list = []

    def get_search(self):
        """Prompt for a keyword on stdin and store the base request URL in ``self.url``."""
        # Function-scope import so this class does not depend on a
        # file-level import being present.
        from urllib.parse import quote

        search = input('請輸入你要搜索的圖片')
        # Percent-encode the keyword: characters such as '&', '#' or spaces
        # would otherwise split or truncate the query string.
        keyword = quote(search, safe='')
        # Per the original author's note, only the key parameters
        # (queryWord / word) were analysed; the remaining ones are left at
        # these fixed values.
        self.url = (
            'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
            '&ct=201326592&is=&fp=result'
            f'&queryWord={keyword}'
            '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest='
            '&copyright='
            f'&word={keyword}'
            '&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr='
            '&expermode=&force=&rn=30&gsm='
        )

    def get_img_url_list(self):
        """Page through the results and collect thumbnail URLs in ``self.img_url_list``.

        Paging stops when the endpoint reports ``bdIsClustered == '2'``, when
        a page contributes no thumbnail URL, or when a request or its JSON
        payload fails. URLs gathered before a failure are kept.
        """
        # Start from a fresh per-instance list, bound before the loop so that
        # partial results survive an error. Appending to the class-level list
        # would share results between instances and duplicate them on repeat calls.
        collected = []
        self.img_url_list = collected
        pn = 0
        try:
            # Original author's note: Baidu limits a crawl to roughly
            # 450-480 images; this was not analysed further.
            while True:
                res = self.session.get(f'{self.url}&pn={pn}')
                # Decode the body once instead of re-parsing it for every lookup.
                payload = res.json()
                clustered = payload['bdIsClustered']
                print(clustered)
                if clustered == '2':
                    break
                # Advance the offset by 30, matching rn=30 in the request URL.
                pn += 30
                page_urls = [
                    dic['thumbURL'] for dic in payload['data'] if dic.get('thumbURL')
                ]
                if not page_urls:
                    # A page without thumbnails means nothing more will come;
                    # without this guard the loop could spin forever.
                    break
                collected.extend(page_urls)
        except Exception as exc:
            # Best-effort crawl: keep what was collected so far, but report
            # why paging stopped instead of hiding the failure.
            print(f'抓取圖片列表時中斷: {exc!r}')

    def save_img(self):
        """Download every collected URL and write it to ``第N張.jpg`` (N starts at 1)."""
        for mun, url in enumerate(self.img_url_list, start=1):
            response = self.session.get(url)
            if not response.ok:
                # Do not store an HTTP error page on disk under a .jpg name.
                print(f'第{mun}張下載失敗,已跳過')
                continue
            # Write the raw bytes of the image to a numbered local file.
            with open(f'第{mun}張.jpg', 'wb') as fw:
                fw.write(response.content)
            # Report success only after the file has been closed.
            print(f'第{mun}張保存本地完畢')

    def run(self):
        """Run the whole pipeline: prompt, collect URLs, print their count, download."""
        self.get_search()
        self.get_img_url_list()
        print(len(self.img_url_list))
        self.save_img()


if __name__ == '__main__':
    # Start the interactive crawler only when the file is run as a script.
    BaiDuImg().run()


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM