# 爬蟲之 百度圖片 (Web crawler: Baidu Images)
# http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word=風景
import ast
import os
import re
import time

import requests
# Download the images listed on one Baidu image-search results page.
#
# Flow: fetch the results page, pull the embedded `imgData` payload out of
# the HTML, then for every entry that carries both an `os` and a `cs` token
# open its detail page, read the image URL from it and save the image under
# SAVE_DIR.

# Directory that downloaded images are written into.
SAVE_DIR = '百度圖片'
# Seconds to wait for any single HTTP response; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs(SAVE_DIR, exist_ok=True)

# Fetch the search-results page.
response = requests.get(
    'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word=風景',
    timeout=REQUEST_TIMEOUT)
data = response.text

# The pattern captures the parenthesised argument list of the
# app.setData('imgData', ...) call, i.e. the text "('imgData', {...})".
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
img_data_matches = re.findall(r"app.setData(\('imgData.*?\));", data, re.S)
if not img_data_matches:
    raise SystemExit(
        'imgData payload not found in the search page; '
        'the page layout may have changed')

# SECURITY NOTE: this text comes from a remote server. The original code ran
# eval() on it, which would execute arbitrary expressions. literal_eval only
# accepts Python literals (tuples, dicts, lists, strings, numbers, ...), so
# the captured text is parsed into a (name, payload) tuple without running
# any code.
img_desc_dics = ast.literal_eval(img_data_matches[0])

# Element 1 of the tuple is the payload dict; its 'data' entry is the list
# of per-image description dicts.
img_datas = img_desc_dics[1]['data']

# Compiled once outside the loop; captures the src attribute that follows
# the ('firstSc');" marker on a detail page.
detail_src_pattern = re.compile(r'''\('firstSc'\);" src="(.*?)"''')

count = 0
for img_data in img_datas:
    # Both tokens are required to build the detail-page URL.
    os_ = img_data.get('os')
    cs_ = img_data.get('cs')
    if not (os_ and cs_):
        continue

    # Fetch the detail page for this image.
    img_search_url = f'http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A3%8E%E6%99%AF&step_word=&hs=0&pn=1&spn=0&di=195030&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs={cs_}&os={os_}'
    img_search_response = requests.get(img_search_url, timeout=REQUEST_TIMEOUT)
    img_search_data = img_search_response.text

    # Skip this entry instead of aborting the whole crawl when the detail
    # page does not contain the expected image tag.
    img_urls = detail_src_pattern.findall(img_search_data)
    if not img_urls:
        print(f'image URL not found on detail page, skipped: {img_search_url}')
        continue
    img_url = img_urls[0]

    # Local path = SAVE_DIR + last URL path segment,
    # e.g. 百度圖片/3822951_144045377000_2.jpg
    img_name = os.path.join(SAVE_DIR, img_url.split('/')[-1])

    # Download and save the image; `with` guarantees the file is flushed and
    # closed even if the write fails.
    img_response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
    with open(img_name, 'wb') as fw:
        fw.write(img_response.content)

    # Progress report.
    count += 1
    print(f'{img_name}保存成功,成功保存{count}張')

    # Brief pause between downloads to reduce the chance of an IP ban.
    time.sleep(0.01)