程序簡介
百度圖片爬蟲的封裝接口,2018年實現的,現在還能用...不錯,謝謝百度的不封之恩,現將其貢獻給所有熱愛技術的開發者
輸入:關鍵詞、下載數量、重定尺寸(可省)
輸出:自動創建文件夾下載對應數量的百度圖片,圖片由md5命名
程序/數據集下載
代碼分析
導入模塊
import numpy as np
import hashlib
import requests
import json
import cv2
import os
evalMd5函數用來計算圖片md5,好進行命名和過濾相同圖片
def evalMd5(sentence, charset='utf8'):
    '''
    Compute the hexadecimal MD5 digest of a string or bytes-like value.

    :param sentence: text (``str``) or binary data (``bytes``, ``bytearray``,
        ``memoryview``) to hash
    :param charset: encoding used to turn a ``str`` into bytes before hashing;
        ignored for binary input
    :return: MD5 digest as a lowercase hexadecimal string
    '''
    # Only text needs encoding. Testing for ``str`` (instead of "is not exactly
    # bytes") lets bytearray, memoryview and bytes subclasses be hashed as-is
    # rather than failing on a missing ``.encode`` method.
    if isinstance(sentence, str):
        sentence = sentence.encode(charset)
    return hashlib.md5(sentence).hexdigest()
resizeImg函數用來重定圖片尺寸
def resizeImg(oldPath, size, newPath):
    '''
    Resize an image file and save the result.

    If the image cannot be read, decoded, resized or encoded, the source file
    at ``oldPath`` is deleted so that broken downloads do not accumulate.

    :param oldPath: path of the image to read
    :param size: target size handed to ``cv2.resize`` as its ``dsize`` argument
    :param newPath: path the resized image is written to; its file extension
        selects the output encoding
    :return: None
    '''
    oldPath = oldPath.replace('\\', '/')
    newPath = newPath.replace('\\', '/')
    try:
        # Read the bytes with numpy and decode in memory instead of passing the
        # path to OpenCV; the file is written the same way further down.
        raw = np.fromfile(oldPath, dtype=np.uint8)
        oldImg = cv2.imdecode(raw, -1)
        newImg = cv2.resize(oldImg, size)
        # Encode in memory and write once. The previous extra cv2.imwrite call
        # wrote the same image to the same path and was redundant.
        ok, encoded = cv2.imencode(os.path.splitext(newPath)[1], newImg)
        if not ok:
            raise ValueError('could not encode image for %s' % newPath)
        encoded.tofile(newPath)
    except Exception:
        # Unreadable or unsupported image: drop the source file.
        os.remove(oldPath)
核心函數download會調用上面的函數進行批量圖片下載
# Number of results requested per search page (sent as the ``rn`` parameter).
_PAGE_SIZE = 30
# Stop after this many bad pages/requests, link-less entries, or duplicates.
_MAX_FAILURES = 20
# Seconds to wait for any single HTTP request before treating it as failed.
_REQUEST_TIMEOUT = 10


def _searchParams(keyWord, offset):
    '''Build the query parameters for one page of the image search endpoint.'''
    return {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyWord,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyWord,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': offset,
        'rn': _PAGE_SIZE,
        'gsm': '1e',
        '1488942260214': '',
    }


def _fetchPage(url, keyWord, offset):
    '''Request one result page; return its ``data`` list, or None on failure.'''
    try:
        response = requests.get(
            url, params=_searchParams(keyWord, offset), timeout=_REQUEST_TIMEOUT
        )
        # Every backslash is doubled before parsing, presumably because the
        # endpoint emits escape sequences that are not valid JSON.
        data = json.loads(response.text.replace('\\', '\\\\'))['data']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        return None
    return data if isinstance(data, list) else None


def _imgSuffix(imgUrl, default='jpg'):
    '''Return a filename-safe extension for an image URL, or ``default``.'''
    # Look only at the last path segment, without query string or fragment, so
    # the suffix can never contain '/' or '?'.
    name = imgUrl.split('?', 1)[0].split('#', 1)[0].rsplit('/', 1)[-1]
    suffix = os.path.splitext(name)[1].lstrip('.')
    # NOTE(review): 'jpg' is a guess for URLs without an extension.
    return suffix if suffix.isalnum() else default


def download(keyWord, imgNumber, imgSize=None):
    '''
    Download images for a keyword into a folder named after the keyword.

    Files are named by the MD5 of their content, so an image that is fetched
    twice overwrites itself instead of creating a second file. The function
    returns when the folder holds ``imgNumber`` files, or when 20 page/request
    failures, 20 entries without a link, or 20 duplicate downloads have been
    counted, or when all pages have been requested.

    :param keyWord: search keyword; also used as the output folder name
    :param imgNumber: number of files the folder should contain
    :param imgSize: optional size passed to ``resizeImg``; falsy means no resize
    :return: None
    '''
    # Create the keyword folder.
    dirname = keyWord
    os.makedirs(dirname, exist_ok=True)

    url = 'https://image.baidu.com/search/acjson'
    same = 0      # downloads that did not add a new file
    error = 0     # failed page or image requests
    passNum = 0   # result entries without an image link

    # NOTE(review): paging starts at 30, so if ``pn`` is a zero-based offset
    # the first 30 results are never requested -- confirm this is intended.
    for offset in range(_PAGE_SIZE, _PAGE_SIZE * 10000 + _PAGE_SIZE, _PAGE_SIZE):
        data = _fetchPage(url, keyWord, offset)
        if data is None:
            error += 1
            if error >= _MAX_FAILURES:
                return None
            continue

        for item in data:
            imgUrl = item.get("middleURL")
            if imgUrl is None:
                passNum += 1
                if passNum >= _MAX_FAILURES:
                    return None
                continue

            try:
                imgContent = requests.get(imgUrl, timeout=_REQUEST_TIMEOUT).content
            except requests.RequestException:
                # One unreachable image should not abort the whole crawl.
                error += 1
                if error >= _MAX_FAILURES:
                    return None
                continue

            imgMd5 = evalMd5(imgContent)
            imgPath = os.path.join(dirname, '%s.%s' % (imgMd5, _imgSuffix(imgUrl)))

            # Compare the file count before and after to detect duplicates.
            oldFinish = len(os.listdir(dirname))
            with open(imgPath, 'wb') as imgFile:
                imgFile.write(imgContent)
            if imgSize:
                resizeImg(imgPath, imgSize, imgPath)
            newFinish = len(os.listdir(dirname))
            print('key:%s goal:%d finish:%d' % (keyWord, imgNumber, newFinish))

            # Enough files collected: done.
            if newFinish >= imgNumber:
                return None
            # 20 downloads without a new file: assume the results are exhausted.
            if newFinish == oldFinish:
                same += 1
                if same >= _MAX_FAILURES:
                    return None
來測試一下看看效果吧~
# Demo run: fetch ten images per keyword at their original size.
imgSize = None
imgNumber = 10
keys = ['電子琴', '蘋果']
for keyWord in keys:
    download(keyWord, imgNumber, imgSize=imgSize)
key:電子琴 goal:10 finish:1
key:電子琴 goal:10 finish:2
key:電子琴 goal:10 finish:3
key:電子琴 goal:10 finish:4
key:電子琴 goal:10 finish:5
key:電子琴 goal:10 finish:6
key:電子琴 goal:10 finish:7
key:電子琴 goal:10 finish:8
key:電子琴 goal:10 finish:9
key:電子琴 goal:10 finish:10
key:蘋果 goal:10 finish:1
key:蘋果 goal:10 finish:2
key:蘋果 goal:10 finish:3
key:蘋果 goal:10 finish:4
key:蘋果 goal:10 finish:5
key:蘋果 goal:10 finish:6
key:蘋果 goal:10 finish:7
key:蘋果 goal:10 finish:8
key:蘋果 goal:10 finish:9
key:蘋果 goal:10 finish:10