使用 requests 代替 urllib 和 urllib2
使用os模塊操作文件夾
將爬取的圖片保存到本地
使用BeautifulSoup代替正則提取頁面內容
附爬蟲代碼
# -*- coding: utf-8 -*-
# Crawler for an image gallery site (mzitu.com).
import requests
import os
import time
from bs4 import BeautifulSoup


class MeiNv:
    """Scrape every album listed on the site's index page and save each
    album's images into a per-album sub-directory under *path*.
    """

    def __init__(self, path):
        # Root directory under which one folder per album is created.
        self.filePath = path
        # Canonical header name is "User-Agent" (the original "user-Agent"
        # worked only because HTTP header names are case-insensitive).
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0"}

    def doRequest(self, url):
        """GET *url* with the crawler's headers and return the decoded body text."""
        html = requests.get(url, headers=self.headers)
        return html.text

    def doSoup(self, content):
        """Parse the index page *content*, then walk every album link:
        create a folder per album, discover the page count, and download
        each image. Returns None; all results are side effects on disk.
        """
        con_soup = BeautifulSoup(content, 'lxml')
        # Every album is an <a> inside <div class="all"> on the index page.
        a_list = con_soup.find("div", class_="all").find_all('a')
        for item in a_list:
            # Album title doubles as the folder name.
            title = item.get_text()
            self.mkdir(title)
            page = item['href']
            page_html = self.doRequest(page)
            # The second-to-last <span> in the pager holds the page count.
            html_soup = BeautifulSoup(page_html, 'lxml')
            max_span = html_soup.find('div', class_='pagenavi').find_all('span')[-2].get_text()
            for i in range(1, int(max_span) + 1):
                # Throttle one request per second to be polite to the server.
                time.sleep(1)
                page_url = page + '/' + str(i)
                img_html = self.doRequest(page_url)
                imghtml_soup = BeautifulSoup(img_html, 'lxml')
                img_url = imghtml_soup.find('div', class_='main-image').find('img')['src']
                # Last 5 characters before the ".jpg" extension serve as the file name.
                name = img_url[-9:-4]
                img = requests.get(img_url, headers=self.headers)
                self.writeToFile(name, img.content)

    def writeToFile(self, filename, content):
        """Write binary *content* to "<filename>.jpg" in the current directory.

        Uses a context manager so the handle is closed even if the write fails
        (the original open/close pair leaked on error).
        """
        with open(filename + '.jpg', 'wb') as f:
            f.write(content)

    def mkdir(self, path):
        """Create (if needed) the album folder under self.filePath and chdir
        into it so subsequent image writes land there.

        Fixes two defects of the original: the existence check used a
        hard-coded "D:\\meinv\\" root instead of self.filePath, and when the
        folder already existed the method returned without chdir-ing, so
        images were written into whatever directory happened to be current.
        """
        path = path.strip()
        target = os.path.join(self.filePath, path)
        if not os.path.exists(target):
            print(u'創建了一個名為%s的文件夾' % (path))
            os.makedirs(target)
        else:
            print(u'名字叫做', path, u'的文件夾已經存在了!')
        # Always enter the album folder, whether freshly created or pre-existing.
        os.chdir(target)

    def start(self, url):
        """Entry point: fetch the index page and crawl every album on it."""
        content = self.doRequest(url)
        # doSoup performs all downloading/saving itself and returns None.
        # The original then called self.writeToFile(None), which raised a
        # TypeError (writeToFile needs two arguments) — that call is removed.
        self.doSoup(content)


if __name__ == "__main__":
    # Guarded so importing this module does not fire the crawler.
    url = "http://www.mzitu.com/all"
    path = "D:\meinv\\"
    meinv = MeiNv(path)
    meinv.start(url)