python爬蟲爬圖片
第一步
載入爬蟲模塊
from requests_html import HTMLSession #載入爬蟲模塊
第二步
創建session對象
from requests_html import HTMLSession #載入爬蟲模塊
session =HTMLSession() # session object created; later requests go through it
第三步
發現百度圖片搜索的url規律,發起請求,並匹配到圖片的url
http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=我們搜圖片的關鍵字
from requests_html import HTMLSession #載入爬蟲模塊
# Create the session object that issues the request below.
session = HTMLSession()

# Search-result page for the example keyword used throughout this tutorial.
search_url = 'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子'
response = session.get(search_url)

# Template describing how an image URL appears in the page source:
# the `{}` placeholder stands for the value of each "thumbURL" field.
img_url_regex = '"thumbURL":"{}",'

# Collect every match of the template found in the fetched page.
page = response.html
img_url_list = page.search_all(img_url_regex)
第四步
訪問圖片url並且保存下來
from requests_html import HTMLSession #載入爬蟲模塊
# Create the session object that issues every request below.
session = HTMLSession()

# Fetch the search-result page for the example keyword.
response = session.get('http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word=二傻子')

# Template describing how an image URL appears in the page source:
# the `{}` placeholder stands for the value of each "thumbURL" field.
img_url_regex = '"thumbURL":"{}",'

# Every match of the template found in the fetched page.
img_url_list = response.html.search_all(img_url_regex)

# Download each matched image and save it as 第1張.jpg, 第2張.jpg, ...
# (relative paths, so the files land in the current working directory).
for index, url in enumerate(img_url_list, start=1):
    # url[0] is the text captured by the `{}` placeholder, i.e. the image URL.
    # A separate name is used so the search-page `response` is not overwritten.
    img_response = session.get(url[0])
    # Write the raw response bytes to disk.
    with open(f'第{index}張.jpg', 'wb') as fw:
        fw.write(img_response.content)
第五步
類的封裝
from requests_html import HTMLSession
class BaiDuImg:
    """Save the thumbnails found on a Baidu image-search result page.

    Typical use is ``BaiDuImg().run()``: it prompts for a keyword, extracts
    the thumbnail URLs from the result page and writes each image to the
    current working directory as ``第1張.jpg``, ``第2張.jpg``, ...
    """

    # Session created once, when the class is defined, and shared by all instances.
    session = HTMLSession()
    # Template handed to ``response.html.search_all``; the ``{}`` placeholder
    # stands for the value of each "thumbURL" field in the page source.
    img_url_regex = '"thumbURL":"{}",'
    # Search-page URL; set by get_search().
    url = ''
    # Matches returned by search_all(); set by get_img_url_list().
    img_url_list = []

    def get_search(self):
        """Prompt for a search keyword and store the search-page URL in ``self.url``."""
        # Imported locally so this class does not depend on a file-level import.
        from urllib.parse import quote

        search = input('請輸入你要搜索的圖片')
        # Percent-encode the keyword: characters such as '&', '#' or spaces
        # would otherwise split or truncate the query string.
        keyword = quote(search, safe='')
        self.url = f'http://image.baidu.com/search/index?tn=baiduimage&fm=result&ie=utf-8&word={keyword}'

    def get_img_url_list(self):
        """Fetch ``self.url`` and store every thumbnail match in ``self.img_url_list``."""
        response = self.session.get(self.url)
        self.img_url_list = response.html.search_all(self.img_url_regex)

    def save_img(self):
        """Download every entry of ``self.img_url_list`` and write it to a numbered .jpg file."""
        for index, url in enumerate(self.img_url_list, start=1):
            # url[0] is the text captured by the `{}` placeholder, i.e. the image URL.
            response = self.session.get(url[0])
            # Write the raw response bytes to disk.
            with open(f'第{index}張.jpg', 'wb') as fw:
                fw.write(response.content)

    def run(self):
        """Run the whole pipeline: prompt, collect image URLs, download."""
        self.get_search()
        self.get_img_url_list()
        self.save_img()
# Script entry point: run the scraper only when this file is executed directly.
if __name__ == '__main__':
    baidu = BaiDuImg()
    baidu.run()
後來有個研一的小姐姐說要把全部圖片都爬完,那就改改
from requests_html import HTMLSession
class BaiDuImg:
    """Collect thumbnail URLs from Baidu's image-search JSON endpoint and save them.

    Typical use is ``BaiDuImg().run()``: it prompts for a keyword, pages
    through the ``acjson`` endpoint 30 results at a time, then writes each
    image to the current working directory as ``第1張.jpg``, ``第2張.jpg``, ...
    """

    # Session created once, when the class is defined, and shared by all instances.
    session = HTMLSession()
    # Not used by the methods of this class; kept so the attribute stays available.
    img_url_regex = '"thumbURL":"{}",'
    # Base URL of the JSON endpoint (without the ``pn`` offset); set by get_search().
    url = ''
    # Thumbnail URLs gathered by get_img_url_list().
    img_url_list = []

    def get_search(self):
        """Prompt for a search keyword and store the endpoint URL in ``self.url``."""
        # Imported locally so this class does not depend on a file-level import.
        from urllib.parse import quote

        # Percent-encode the keyword: characters such as '&', '#' or spaces
        # would otherwise split or truncate the query string.
        keyword = quote(input('請輸入你要搜索的圖片'), safe='')
        # Only the key parameters are filled in; the rest are sent as captured,
        # unanalysed. "&copyright=" had been mangled into "©right=" (the
        # "&copy" HTML entity) and is restored here.
        self.url = (
            'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
            f'&ct=201326592&is=&fp=result&queryWord={keyword}&cl=2&lm=-1'
            '&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright='
            f'&word={keyword}&s=&se=&tab=&width=&height=&face=0&istype=2&qc='
            '&nc=1&fr=&expermode=&force=&rn=30&gsm='
        )

    def get_img_url_list(self):
        """Page through the endpoint and gather every ``thumbURL`` into ``self.img_url_list``.

        Collection is best-effort: if a request or the JSON handling fails,
        the error is reported and the URLs gathered so far are kept.
        """
        # Start from a fresh per-instance list; appending to the class-level
        # default would share results between instances and repeated calls.
        self.img_url_list = []
        pn = 0  # result offset sent as the ``pn`` query parameter
        try:
            # Author's note: Baidu seems to cap what can be fetched at roughly
            # 450-480 images; the exact limit was not investigated.
            while True:
                # Parse the JSON body once per page.
                payload = self.session.get(f'{self.url}&pn={pn}').json()
                clustered = payload['bdIsClustered']
                print(clustered)
                # The original code treats the value '2' as "no more results".
                if clustered == '2':
                    break
                pn += 30  # matches rn=30 (results per request) in the URL
                count_before = len(self.img_url_list)
                for dic in payload['data']:
                    img_url = dic.get('thumbURL')
                    if img_url:
                        self.img_url_list.append(img_url)
                # A page that yields no thumbnails means there is nothing left;
                # stop instead of requesting ever-larger offsets forever.
                if len(self.img_url_list) == count_before:
                    break
        except Exception as e:
            # Keep the best-effort behaviour, but no longer hide the reason.
            print(f'抓取圖片鏈接提前結束:{e!r}')

    def save_img(self):
        """Download every URL in ``self.img_url_list`` and write it to a numbered .jpg file."""
        for index, url in enumerate(self.img_url_list, start=1):
            # Fetch the image.
            response = self.session.get(url)
            # Write the raw response bytes to disk.
            with open(f'第{index}張.jpg', 'wb') as fw:
                fw.write(response.content)
            print(f'第{index}張保存本地完畢')

    def run(self):
        """Run the whole pipeline: prompt, collect URLs, print their count, download."""
        self.get_search()
        self.get_img_url_list()
        print(len(self.img_url_list))
        self.save_img()
# Script entry point: run the scraper only when this file is executed directly.
if __name__ == '__main__':
    baidu = BaiDuImg()
    baidu.run()