爬取網站:https://www.169tp.com/xingganmeinv
該網站美眉圖片有數百頁,每頁24張,共上萬張圖片,全部爬取下來
# Crawl every listing page of the gallery, starting at ``first_url``, and save
# each .jpg referenced on those pages into a folder chosen by the user.
import os
import re
import urllib.request
from urllib.parse import urljoin

base_url = "https://www.169tp.com/xingganmeinv/"
first_url = "https://www.169tp.com/xingganmeinv/list_1_1.html"

# Text of the pager item that links to the following page.
# NOTE(review): the pages are decoded as gb18030, so the site may label this
# item with simplified characters — confirm this literal matches the live HTML.
NEXT_PAGE_LABEL = '下一頁'

# A quoted src attribute ending in .jpg. The character class stops the match
# at the closing quote so a non-jpg src cannot swallow a later attribute.
_IMG_SRC_RE = re.compile(r'''src=["']([^"'\n]+?\.jpg)["']''')


def get_html(url, timeout=30):
    """Fetch *url* and return its body decoded as gb18030 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        # Undecodable bytes are replaced so a single malformed page cannot
        # abort a crawl of several hundred pages.
        return response.read().decode('gb18030', errors='replace')


def get_Imgurl_list(html):
    """Return every ``src`` value in *html* that ends in ``.jpg``, in order."""
    return _IMG_SRC_RE.findall(html)


def Download(img_urllist, page_flag, final_path, Imgnums):
    """Download each image URL into *final_path* and return the new total.

    Files are named ``<page_flag>_<n>.jpg`` where ``n`` is the 1-based
    position of the URL in *img_urllist*.

    Args:
        img_urllist: Image URLs to download.
        page_flag: Page index used as the file-name prefix.
        final_path: Directory the files are written to.
        Imgnums: Number of images downloaded before this call.

    Returns:
        *Imgnums* plus the number of images saved by this call. Integers are
        immutable, so the caller must rebind its counter to this value.
    """
    for num, imgurl in enumerate(img_urllist, start=1):
        imgname = os.path.join(final_path, "{}_{}.jpg".format(page_flag, num))
        try:
            urllib.request.urlretrieve(imgurl, imgname)
        except (OSError, ValueError) as err:
            # URLError/HTTPError are OSError subclasses; ValueError covers
            # malformed URLs. One broken image must not stop the crawl.
            print("下載失敗,已跳過:", imgurl, err)
            continue
        print("已經爬取圖片名:", imgname)
        Imgnums += 1
    return Imgnums


def makedir(path):
    """Create directory *path* if it does not exist.

    Returns:
        True if the directory was created, False if the path already existed.
    """
    path = path.strip()
    if os.path.exists(path):
        print("路徑為 ", path, " 的文件夾已經存在")
        return False
    os.makedirs(path)
    print("創建了路徑為 ", path, " 的文件夾")
    return True


def _find_next_page_url(html):
    """Return the absolute URL of the next listing page, or None on the last."""
    # Imported here so the pure helpers above can be used without bs4.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')
    pager = soup.find('div', attrs={'class': 'page'})
    if pager is None:
        return None
    item = pager.find('li', string=NEXT_PAGE_LABEL)
    if item is None:
        return None
    link = item.find('a')
    if link is None or not link.get('href'):
        return None
    return urljoin(base_url, link['href'])


def main():
    """Ask for a target folder, then crawl all pages and report the total."""
    filepath = input("請輸入保持圖片的文件夾路徑:").strip()
    print(filepath)
    name = input("請輸入保存圖片的文件夾名:").strip()
    print(name)
    # The trailing '' keeps a trailing separator on the displayed path.
    finalpath = os.path.join(filepath, name, '')
    makedir(finalpath)
    print(f"圖片保存路徑: {finalpath}")

    Imgnums = 0
    page_flag = 0
    visited = set()
    url = first_url
    # Stop when there is no next link, or when the pager points back to a
    # page that was already processed.
    while url and url not in visited:
        visited.add(url)
        html = get_html(url)  # fetched once, used for images and pager
        img_urls = [urljoin(url, src) for src in get_Imgurl_list(html)]
        Imgnums = Download(img_urls, page_flag, finalpath, Imgnums)
        url = _find_next_page_url(html)
        page_flag += 1
    print(f"下載完成,共下載了 {Imgnums} 張圖片!")


if __name__ == "__main__":
    main()
運行截圖:
圖片命名規則:存儲路徑+頁碼+下劃線+圖片號+.jpg
圖片文件夾截圖: