"""Crawler for the http://www.7160.com/xiaohua/ photo-gallery section.

Walks every list page, creates one directory per gallery under SAVE_ROOT,
and downloads each image in the gallery.  A gallery whose directory already
holds at least as many files as the gallery's page count is skipped.
"""
import os
import re

import requests
from bs4 import BeautifulSoup

# Site root and the list-section URL being crawled.
BASE_URL = 'http://www.7160.com'
LIST_URL = 'http://www.7160.com/xiaohua/'
# Root directory under which one sub-directory per gallery is created.
SAVE_ROOT = 'H:/school_girl/'
# Browser-like User-Agent so the site serves normal pages.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
}
# Highest list-page number to crawl (hard-coded in the original; the site
# showed 255 pages at the time — TODO: scrape the real maximum instead).
MAX_PAGE = 255
# Seconds before a hung HTTP request is abandoned (original had no timeout,
# so a stalled server froze the whole crawl).
TIMEOUT = 30


def _fetch_gbk(url):
    """GET *url* and return its body decoded as GBK text.

    The site serves gb2312/GBK pages.  Decoding ``response.content``
    directly replaces the original ``text.encode('iso-8859-1').decode('gbk')``
    round-trip, which raises whenever the latin-1 text has mangled a byte;
    ``errors='replace'`` keeps one bad byte from killing the crawl.
    """
    resp = requests.get(url, headers=HEADER, timeout=TIMEOUT)
    return resp.content.decode('gbk', errors='replace')


def _page_stub(href):
    """Return *href* with a trailing '.html' removed.

    The original rebuilt the stub via ``''.join(re.findall(r'.{14}', href))``,
    which silently drops ``len(href) % 14`` trailing characters and is only
    correct when '.html' happens to fall entirely in that remainder.
    Stripping the suffix explicitly is correct for any path length.
    """
    return href[:-len('.html')] if href.endswith('.html') else href


for page_no in range(1, MAX_PAGE + 1):
    # One list page per iteration: list_6_1.html ... list_6_255.html
    soup = BeautifulSoup(_fetch_gbk('%slist_6_%d.html' % (LIST_URL, page_no)), 'lxml')
    links = soup.find('div', class_='news_bom-left').find_all('a', target='_blank')
    for link in links:
        title = link.get_text()
        if title == '':
            continue
        # Windows cannot create a directory containing '?', so drop it.
        dir_name = title.strip().replace('?', '')
        gallery_dir = os.path.join(SAVE_ROOT, dir_name)
        already_there = os.path.exists(gallery_dir)
        if not already_there:
            os.makedirs(gallery_dir)
        print('准備爬取:' + title)
        # Gallery index page: yields the <h1> title (used as the img alt
        # attribute) and the image count from the pager.
        mess = BeautifulSoup(_fetch_gbk(BASE_URL + link['href']), 'lxml')
        alt_title = mess.find('h1').text
        # The second-to-last pager link holds the page (= image) count.
        pic_max = int(mess.find('div', class_='itempage').find_all('a')[-2].text)
        if already_there and len(os.listdir(gallery_dir)) >= pic_max:
            print('已經保存完畢,跳過')
            continue
        stub = _page_stub(link['href'])
        for num in range(1, pic_max + 1):
            # Page 1 is <stub>.html, later pages are <stub>_N.html.
            if num == 1:
                page_url = BASE_URL + stub + '.html'
            else:
                page_url = BASE_URL + stub + '_' + str(num) + '.html'
            pic_tag = BeautifulSoup(_fetch_gbk(page_url), 'lxml').find('img', alt=alt_title)
            print(pic_tag['src'])
            img = requests.get(pic_tag['src'], headers=HEADER, timeout=TIMEOUT)
            filename = pic_tag['src'].split('/')[-1]
            # 'with' guarantees the handle is closed even if the write
            # fails (original used bare open/close and could leak it).
            with open(os.path.join(gallery_dir, filename), 'wb') as fh:
                fh.write(img.content)
        print('完成')
    print('第', page_no, '頁完成')
打印后的結果為:
准備爬取:
陽光下校花美女迷人桃花眼嘴
http://img.7160.com/uploads/allimg/180913/13-1P913102541.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102541-50.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102541-51.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-50.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-51.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-52.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-53.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102542-54.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102543.jpg
http://img.7160.com/uploads/allimg/180913/13-1P913102543-50.jpg
完成
准備爬取:
黑長直發美女學生日系風制服
http://img.7160.com/uploads/allimg/180912/13-1P912102159.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-50.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-51.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102159-52.jpg
http://img.7160.com/uploads/allimg/180912/13-1P912102200.jpg