Code written while learning, recorded here. This script batch-downloads every image that meets a given size requirement from all sub-pages under a site's main page, and it can be interrupted and resumed.
The idea is simple: parse each page with BeautifulSoup4 to get its <a> and <img> elements, treat the collected <a> links as a stack that pages are repeatedly pushed onto and popped from, and filter and download the <img> set.
The full code is as follows:

import os
import time
import urllib.request
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from threading import Thread

'''
class Download(Thread):                          # threaded download, part 1: one thread per image
    def __init__(self, url, filepath):
        Thread.__init__(self)
        self.url = url
        self.filepath = filepath

    def run(self):
        length = 0
        try:
            opener = urllib.request.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            urlhandle = opener.open(self.url, timeout=30)
            urlhead = urlhandle.info()
            if 'Content-Length' in urlhead:
                length = int(urlhead['Content-Length'])
            data = urlhandle.read(10 * 1024)     # read and write in 10 KB chunks
            while data:
                with open(self.filepath, 'ab+') as wf:
                    wf.write(data)
                data = urlhandle.read(10 * 1024)
        except Exception as ex:
            print(self.url + '\n× ' + str(ex))
            try:                                 # on timeout/failure, drop the partial file and log the image url
                os.remove(self.filepath)
                with open('/home/maple/Desktop/bad', 'a') as badFile:
                    badFile.write(self.url + '\n')
            except:
                pass
'''

def maple(root):
    tasks = []                                   # running download threads
    urls = [root]                                # pages waiting to be analyzed (used as a stack)
    urld = []                                    # pages already analyzed and downloaded
    if os.path.exists('tmpUrls'):                # restore the lists saved by an interrupted run
        with open('tmpUrls', 'r') as urlsFile:
            urls = [line.strip() for line in urlsFile if line.strip()]
    if os.path.exists('tmpUrld'):
        with open('tmpUrld', 'r') as urldFile:
            urld = [line.strip() for line in urldFile if line.strip()]
    try:
        times = 3                                # retries allowed when a page fails to load
        while urls:
            curl = urls.pop()
            urld.append(curl)
            print('=================== Current Page: ' + curl + ' =======================')
            try:
                response = urllib.request.urlopen(curl, timeout=5)
                data = response.read().decode('utf8')
                soup = BeautifulSoup(data, 'html.parser')    # parse the page with BeautifulSoup
            except Exception as ex:              # the page failed to load: retry, then give up
                print(ex)
                if times > 0:
                    urls.append(curl)
                    urld.remove(curl)
                    times -= 1
                else:
                    if curl in urld:
                        urld.remove(curl)
                    times = 3
                continue
            path = '/home/maple/Desktop/images/'
            count = 1
            for tag in soup.find_all('img'):     # every image link on the page
                width = 0
                height = 0
                attrs = tag.attrs
                if 'src' in attrs:
                    image = attrs['src']
                    ext = image[image.rfind('.'):]
                    if 'alt' in attrs:           # this site carries the image name in alt; other sites may differ or omit it
                        filepath = os.path.join(path, attrs['alt'] + ext)
                    else:
                        filepath = os.path.join(path, str(count) + ext)
                        count += 1
                    if 'width' in attrs:         # size attributes in the link; width and height are not always present
                        width = int(attrs['width'])
                    if 'height' in attrs:
                        height = int(attrs['height'])
                    num = 1
                    while os.path.exists(filepath):          # rename sequentially on local name collisions
                        fname, fext = os.path.splitext(filepath)
                        if '(' + str(num - 1) + ')' + fext in filepath:
                            filepath = filepath.replace('(' + str(num - 1) + ')' + fext,
                                                        '(' + str(num) + ')' + fext)
                        else:
                            filepath = fname + '(' + str(num) + ')' + fext
                        num += 1
                    for i in range(3):           # retry failed downloads (drop this loop with the threaded code)
                        try:
                            if (width == 0 or width >= 250) or (height == 0 or height >= 350):
                                length = 0
                                image_handle = urllib.request.urlopen(attrs['src'], timeout=5 + i * 10)  # longer timeout on each retry
                                image_head = image_handle.info()
                                if 'Content-Length' in image_head:       # actual file size
                                    length = int(image_head['Content-Length'])
                                print(attrs['src'] + ' ==== SIZE:{}*{} -- {}KB'.format(width, height, length / 1000))
                                if length > 20 * 1000:       # only download files above a threshold, skipping icons and link buttons
                                    with open(filepath, 'wb') as file:
                                        file.write(image_handle.read())
                                    print('√')
                            break
                            '''
                            task = Download(attrs['src'], filepath)      # threaded download, part 2: spawn a thread per image
                            task.daemon = True                           # make it a background thread
                            task.start()
                            tasks.append(task)
                            '''
                        except Exception as ex:
                            if i < 2:
                                continue
                            print('× ' + str(ex))            # still failing after 3 attempts: log the image url
                            try:
                                os.remove(filepath)
                                with open('/home/maple/Desktop/bad', 'a') as badFile:
                                    badFile.write(attrs['src'] + '\n')
                            except:
                                pass
            '''
            if len(tasks) >= 10:                 # pause whenever 10 download threads are in flight
                while [task for task in tasks if task.is_alive()]:
                    time.sleep(2)
                tasks = []
            '''
            for a in soup.find_all('a'):         # push every unvisited link on this page onto the stack
                attrs = a.attrs
                if 'href' in attrs:
                    url = attrs['href']
                    if urlparse(url)[1]:
                        if urlparse(url)[1] != urlparse(curl)[1]:
                            continue             # skip links that leave the site
                    else:
                        url = urljoin(curl, url)
                    if url not in urls and url not in urld:
                        urls.append(url)
    except KeyboardInterrupt:                    # on <C-c>, save the pending and finished page lists locally
        with open('tmpUrls', 'w') as urlsFile:
            urlsFile.writelines(line + '\n' for line in urls)
        with open('tmpUrld', 'w') as urldFile:
            urldFile.writelines(line + '\n' for line in urld)

if __name__ == '__main__':
    print('+++++++++++++++++++++++  version: python3.4  +++++++++++++++++++++')
    url = 'http://www.msnzx.com/'                # sample site (so many sub-pages and images that a full run takes a long time)
    maple(url)
Some details of this script are specific to http://www.msnzx.com/; downloading from another site only takes minor tweaks. Page analysis is handled entirely by the powerful third-party module BeautifulSoup4, which makes it quick and convenient.
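In isolation, the parsing step is just two find_all calls. A minimal sketch, assuming some already-fetched markup (the sample HTML below is made up):

from bs4 import BeautifulSoup

html = '<a href="/page2">next</a><img src="/a.jpg" width="300" alt="a">'   # made-up sample markup
soup = BeautifulSoup(html, 'html.parser')

for img in soup.find_all('img'):     # every <img> element and the attributes the script reads
    print(img.attrs.get('src'), img.attrs.get('width'), img.attrs.get('alt'))
for a in soup.find_all('a'):         # every <a> element's link target
    print(a.attrs.get('href'))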
There are many ways to implement the download part; the script above includes two:
1. Read and write the stream directly with urllib.request in a single shot. The program blocks while each file downloads, which suits small images: a picture is either downloaded completely or not at all (the local file has size 0), and on a poor connection you can catch the timeout exception and record the URL of every image that failed (a standalone sketch of this pattern follows the list).
2. Download in multiple threads, assigning each image resource its own thread; the commented-out parts of the script above are this threaded code. It downloads quickly, and even on a bad connection it still saves part of each image.
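For reference, here is point 1 reduced to a minimal, standalone sketch; save_image is a hypothetical helper name, not something from the script above:

import urllib.request

def save_image(url, filepath, timeout=5):        # hypothetical helper, for illustration only
    try:
        handle = urllib.request.urlopen(url, timeout=timeout)
        data = handle.read()                     # blocks until the whole body has arrived
        with open(filepath, 'wb') as f:          # the file is created only after a full read,
            f.write(data)                        # so it is either complete or absent
        return True
    except Exception as ex:                      # timeouts land here; record the url for a retry
        print('× ' + url + ' ' + str(ex))
        return False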
Plenty of other approaches exist as well, such as delegating to another module (urlretrieve from urllib.request, or the multithreaded file downloader download built in an earlier post) or to system tools like wget and curl. These direct calls can also give every image a multithreaded download, and they are the simplest to implement.
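A rough sketch of these direct calls; the url and path below are placeholders, and the wget line assumes the tool is installed:

import subprocess
import urllib.request

url = 'http://www.msnzx.com/sample.jpg'              # placeholder image url
filepath = '/home/maple/Desktop/images/sample.jpg'   # placeholder save path

urllib.request.urlretrieve(url, filepath)            # let the library drive the whole transfer

subprocess.call(['wget', '-O', filepath, url])       # or hand the job to an external tool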