這是晚上閒來無事寫的 Python 爬蟲小程序,專門用來爬取妹子圖網站的圖片,養眼用的,嘻嘻!身為程序狗只會這個了!
廢話不多說,代碼附上,僅供參考學習!
"""Crawl gallery images from mzitu.com.

The year to crawl is chosen at start-up; every gallery of that year is
downloaded into an automatically created category/date/title folder.

Author: 68喜科技
"""
import os
import re
from urllib.parse import urlsplit

import requests
from lxml import etree

# Browser-like User-Agent sent with every request (page and image alike).
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/64.0.3282.186 Safari/537.36"
)

# Seconds to wait for a server before giving up, so a stalled connection
# cannot hang the crawl forever.
REQUEST_TIMEOUT = 30

# Path separators and characters Windows forbids in file names.
_UNSAFE_NAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def _safe_name(name):
    """Return *name* with path-unsafe characters replaced by "_" and trimmed."""
    return name.translate(_UNSAFE_NAME_CHARS).strip()


class Meizitu(object):
    """Crawler that downloads the gallery images of one year from mzitu.com."""

    def __init__(self, year):
        """Remember the archive page URL, the page headers and the target year.

        Args:
            year: Year (int) whose galleries should be downloaded.
        """
        self.url = "http://www.mzitu.com/all/"
        self.headers = {"User-Agent": USER_AGENT}
        self.year = year

    def get_page(self, url, headers):
        """Fetch *url* and return the response body decoded as UTF-8 text.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-success HTTP status.
        """
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Fail loudly here instead of parsing an error page further down.
        response.raise_for_status()
        return response.content.decode()

    def get_detail_urls_list(self, page_content, year):
        """Extract the gallery URLs listed under *year* on the archive page.

        The year is matched against the text of each ``div.year`` heading
        rather than computed from a fixed offset, so the lookup keeps working
        when the site adds newer years.
        NOTE(review): assumes each heading's text contains the year as its
        first run of digits (e.g. "2018年") - confirm against the live page.

        Args:
            page_content: HTML text of the archive page.
            year: Year to look up.

        Returns:
            A list of gallery URLs; empty when the year is not on the page.
        """
        html_content = etree.HTML(page_content)
        wanted = int(year)
        urls_list = []
        for year_div in html_content.xpath("//div[@class='year']"):
            label = "".join(year_div.itertext())
            found = re.search(r"\d+", label)
            if found is None or int(found.group()) != wanted:
                continue
            # The links of a year live in the element right after its heading.
            urls_list.extend(
                year_div.xpath(
                    "following-sibling::*[1]//p[@class='url']/a/@href"
                )
            )
        return urls_list

    def save_path(self, detail_html_content, first_img_url, img_name):
        """Build the target directory for one gallery and create it.

        The directory is ``./妹子圖/<category><date path>/<title>/`` where the
        category is the second breadcrumb link of the detail page and the
        date path is the directory part of the first image's URL
        (e.g. ``/2018/02/``).

        Args:
            detail_html_content: Parsed detail page (lxml element).
            first_img_url: URL of the gallery's first image.
            img_name: Gallery title, used as the innermost folder name.

        Returns:
            The directory path as a string ending in "/".
        """
        crumbs = detail_html_content.xpath("//div[@class='currentpath']/a/text()")
        # Fall back to no category folder when the breadcrumb is missing.
        path_prefix1 = _safe_name(crumbs[1]) if len(crumbs) > 1 else ""
        # Directory part of the image URL path, independent of scheme/host
        # length (the canonical URL yields "/2018/02/").
        url_path = urlsplit(first_img_url).path
        path_prefix2 = url_path[:url_path.rfind("/") + 1] or "/"
        save_path = "./妹子圖/" + path_prefix1 + path_prefix2 + _safe_name(img_name) + "/"
        os.makedirs(save_path, exist_ok=True)
        return save_path

    def save_img(self, img_url, img_headers, img_save_path):
        """Download one image and write it to *img_save_path*.

        A failed download (network error, timeout, non-success status) is
        reported and skipped so that one bad image neither aborts the crawl
        nor leaves an error page saved as a ".jpg" file.
        """
        try:
            response = requests.get(
                img_url, headers=img_headers, timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as err:
            print("下載失敗,已跳過: {} ({})".format(img_url, err))
            return
        with open(img_save_path, "wb") as f:
            f.write(response.content)

    def img_url(self, first_img_url, img_index):
        """Return the URL of the *img_index*-th image of a gallery.

        Image URLs look like ``http://i.meizitu.net/2018/02/18c01.jpg``: a
        common prefix followed by a zero-padded index and ".jpg".
        NOTE(review): assumes the first image's URL ends in "01.jpg".
        """
        # Cut the trailing "01.jpg" instead of slicing at a fixed offset so
        # other schemes, hosts or file-name lengths keep working.
        prefix = first_img_url[:-len("01.jpg")]
        return "{}{:02d}.jpg".format(prefix, img_index)

    def img_headers(self, url, img_index):
        """Return the request headers for the *img_index*-th image.

        The Referer is the gallery page that shows the image: *url* itself
        for the first image, ``url/<index>`` for the others.
        """
        refer_url = url if img_index == 1 else url + "/" + str(img_index)
        return {
            "Host": "i.meizitu.net",
            "Referer": refer_url,
            "User-Agent": USER_AGENT,
        }

    def get_img_urls(self, url, detail_html_content, first_img_url, img_name, save_path):
        """Download every image of one gallery into *save_path*.

        Args:
            url: Gallery detail page URL (used to build the Referer).
            detail_html_content: Parsed detail page (lxml element).
            first_img_url: URL of the gallery's first image.
            img_name: Gallery title, used as the file-name stem.
            save_path: Directory (ending in "/") that receives the files.
        """
        # The image count is the highest page number in the pager; taking the
        # maximum avoids relying on a fixed position among the pager links.
        # A gallery without numeric pager links is treated as a single image.
        page_labels = detail_html_content.xpath("//div[@class='pagenavi']/a/span/text()")
        page_numbers = [
            int(label) for label in page_labels if label.strip().isdecimal()
        ]
        img_total_num = max(page_numbers) if page_numbers else 1

        file_stem = _safe_name(img_name)
        for img_index in range(1, img_total_num + 1):
            img_url = self.img_url(first_img_url, img_index)
            img_headers = self.img_headers(url, img_index)
            # Keep the Host header in step with the real image host.
            img_headers["Host"] = urlsplit(img_url).netloc or img_headers["Host"]
            img_save_path = save_path + file_stem + str(img_index) + ".jpg"
            self.save_img(img_url, img_headers, img_save_path)

    def get_image(self, detail_urls_list):
        """Download all galleries whose detail page URLs are given.

        Galleries whose page cannot be fetched or lacks the expected image or
        title are reported and skipped. ``None`` or an empty list is accepted
        and downloads nothing.
        """
        for url in detail_urls_list or []:
            try:
                detail_page = self.get_page(url, headers=self.headers)
            except requests.RequestException as err:
                print("無法取得頁面,已跳過: {} ({})".format(url, err))
                continue
            detail_html_content = etree.HTML(detail_page)
            if detail_html_content is None:
                print("頁面結構不符,已跳過: {}".format(url))
                continue
            img_srcs = detail_html_content.xpath("//div[@class='main-image']/p/a/img/@src")
            titles = detail_html_content.xpath("//h2[@class='main-title']/text()")
            if not img_srcs or not titles:
                print("頁面結構不符,已跳過: {}".format(url))
                continue
            first_img_url = img_srcs[0]
            img_name = titles[0]
            save_path = self.save_path(detail_html_content, first_img_url, img_name)
            self.get_img_urls(url, detail_html_content, first_img_url, img_name, save_path)

    def run_spider(self):
        """Run the crawl: fetch the archive page, then download the year's galleries."""
        page_content = self.get_page(self.url, self.headers)
        detail_urls_list = self.get_detail_urls_list(page_content, self.year)
        if not detail_urls_list:
            print("找不到 {} 年的圖集".format(self.year))
            return
        self.get_image(detail_urls_list)


if __name__ == "__main__":
    try:
        year = int(input("請輸入您要爬取的年份:"))
    except ValueError:
        raise SystemExit("年份必須是整數")
    meizitu = Meizitu(year)
    meizitu.run_spider()