爬取网站:https://www.169tp.com/xingganmeinv
该网站美眉图片有数百页,每页24张,共上万张图片,全部爬取下来
import urllib.request
import re
import os

base_url = "https://www.169tp.com/xingganmeinv/"
first_url = "https://www.169tp.com/xingganmeinv/list_1_1.html"


def get_html(url):
    """Fetch *url* and return the response body decoded as gb18030.

    gb18030 is used because the target site serves GB-encoded Chinese pages.
    """
    response = urllib.request.urlopen(url)
    return response.read().decode('gb18030')


def get_Imgurl_list(html):
    """Return every .jpg URL found in a src="..." or src='...' attribute of *html*."""
    return re.findall(r'src=["\'](.+?\.jpg)["\']', html)


def Download(img_urllist, page_flag, final_path, Imgnums):
    """Download every URL in *img_urllist* into *final_path*.

    Files are named <final_path><page_flag>_<n>.jpg (n starts at 1 per page).
    Returns the updated running total of downloaded images.

    BUGFIX: the original incremented the int parameter `Imgnums` in place,
    which only changed a local binding — the caller's total stayed 0.  The
    count is now returned so the caller can accumulate it.
    """
    num = 1
    for imgurl in img_urllist:
        imgname = "{}{}{}{}.jpg".format(final_path, page_flag, '_', num)
        urllib.request.urlretrieve(imgurl, imgname)
        print("已经爬取图片名:", imgname)
        Imgnums += 1
        num += 1
    return Imgnums


def makedir(path):
    """Create directory *path* (stripped of surrounding whitespace) if missing.

    Returns True when the directory was created, False when it already existed.
    """
    path = path.strip()
    if not os.path.exists(path):
        print("创建了路径为 ", path, " 的文件夹")
        os.makedirs(path)
        return True
    print("路径为 ", path, " 的文件夹已经存在")
    return False


def _find_next_page(soup):
    """Return the <a> tag of the '下一页' (next page) link, or None on the last page.

    BUGFIX: the original chained .find() calls unconditionally, so on the
    final page (where some lookup returns None) the script crashed with
    AttributeError instead of finishing cleanly.
    """
    page_div = soup.find('div', attrs={'class': 'page'})
    if page_div is None:
        return None
    next_li = page_div.find('li', text='下一页')
    if next_li is None:
        return None
    return next_li.find('a')


def main():
    """Crawl every listing page and download all 24 images per page."""
    # bs4 is only needed for pagination; importing it here lets the module be
    # imported (e.g. to reuse get_Imgurl_list) without BeautifulSoup installed.
    from bs4 import BeautifulSoup

    page_flag = 0   # current page index, used as the filename prefix
    Imgnums = 0     # running total of downloaded images

    filepath = input("请输入保存图片的文件夹路径:")  # fixed typo: 保持 -> 保存
    print(filepath)
    name = input("请输入保存图片的文件夹名:")
    print(name)
    finalpath = filepath + name
    makedir(finalpath)
    finalpath += '\\'
    print(f"图片保存路径: {finalpath}")

    # BUGFIX: the original passed the URL string itself to get_Imgurl_list
    # instead of the downloaded HTML, so page 1 yielded no images.  Each
    # page is now also fetched exactly once (the original fetched twice:
    # once for images, once for pagination).
    html = get_html(first_url)
    Imgnums = Download(get_Imgurl_list(html), page_flag, finalpath, Imgnums)
    next_page = _find_next_page(BeautifulSoup(html, 'html.parser'))
    while next_page:
        # NOTE(review): assumes href is relative to the section root — TODO
        # confirm against the site's actual pagination markup.
        new_url = base_url + next_page['href']
        page_flag += 1
        html = get_html(new_url)
        Imgnums = Download(get_Imgurl_list(html), page_flag, finalpath, Imgnums)
        next_page = _find_next_page(BeautifulSoup(html, 'html.parser'))
    print(f"下载完成,共下载了 {Imgnums} 张图片!")


if __name__ == "__main__":
    main()
运行截图:
图片名命名规则:存储路径+页码+下划线+图片号+.jpg
图片文件夹截图: