import time

import pymysql
import requests
from lxml import etree


class my_spider:
    # Initialization (step 1)
    def __init__(self, num1, num2):
        self.base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html"
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "search.51job.com",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }
        self.page_num1 = num1
        self.page_num2 = num2
        # Holds the detail-page links collected from the list pages
        self.det_link = []
        # Abuyun dynamic proxy, shared by every request
        self.proxy = {
            "http": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020",
            "https": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020"
        }

    # Build the list-page URLs (step 1)
    def get_url(self):
        url_list = []
        for i in range(self.page_num1, self.page_num2):
            url_list.append(self.base_url.format(i))
        return url_list

    # Fetch one main list page (step 1)
    def get_pages(self, url):
        response = requests.get(url=url, headers=self.headers, proxies=self.proxy)
        # 51job serves gbk-encoded pages, so decode explicitly
        return self.parse_pages(response.content.decode('gbk'))

    # Parse the detail-page links out of a list page (step 1)
    def parse_pages(self, text):
        html_51job = etree.HTML(text)
        all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
        info_list = []
        for item in all_div:
            info = {}
            info['job_info_link'] = item.xpath("./p/span/a/@href")[0]
            info_list.append(info)
        return info_list

    # Loop over the list pages and collect every detail-page link
    def run(self):
        index_url_list = self.get_url()
        for url in index_url_list:
            time.sleep(1)  # throttle list-page requests
            page_info = self.get_pages(url)
            for job_info in page_info:
                self.det_link.append(job_info['job_info_link'])

    # Fetch one detail page (step 2)
    def get_page_info(self, url):
        print(url)
        response = requests.get(url=url, headers=self.headers, proxies=self.proxy)
        return self.parse_det_info(response.content.decode('gbk'))

    # Parse the fields of interest from a detail page (step 2)
    def parse_det_info(self, pages):
        item = etree.HTML(pages)
        info = {}
        # Any of these nodes can be missing, so guard each lookup
        try:
            info['job_name'] = item.xpath("//div[@class='cn']/h1/@title")[0]
        except IndexError:
            info['job_name'] = 'NaN'
        try:
            # Salary is absent on some postings, which raises IndexError here
            info['job_money'] = item.xpath("//div[@class='cn']/strong/text()")[0]
        except IndexError:
            info['job_money'] = 'NaN'
        try:
            info['company_name'] = item.xpath("//div[@class='cn']/p[@class='cname']/a/@title")[0]
        except IndexError:
            info['company_name'] = 'NaN'
        try:
            info['job_request'] = item.xpath("//div[@class='cn']/p[@class='msg ltype']/@title")[0]
        except IndexError:
            info['job_request'] = 'NaN'
        return info

    # Entry point: collect the links, then fetch and persist each detail page
    def main(self):
        self.run()
        print(self.det_link)
        for url in self.det_link:
            time.sleep(1)  # throttle detail-page requests
            det_pageinfo = self.get_page_info(url)
            print(det_pageinfo)
            self.save_to_mysql(det_pageinfo)

    # Persist one record (step 3)
    def save_to_mysql(self, page_info):
        # Connect to the database
        conn = pymysql.connect(host='localhost', user='root', passwd='root123', db='baidu', port=3306)
        # Cursor object
        cursor = conn.cursor()
        # Parameterized insert: the driver handles quoting and escaping
        cursor.execute(
            "insert into det_job_info(job_name, company_name, job_money, job_request) "
            "VALUES (%s, %s, %s, %s)",
            (page_info['job_name'], page_info['company_name'],
             page_info['job_money'], page_info['job_request'])
        )
        conn.commit()
        # Close the cursor and the connection
        cursor.close()
        conn.close()


if __name__ == "__main__":
    # Crawl pages 159-161, two list pages per spider instance
    for i in range(159, 161, 2):
        time.sleep(1)
        spider = my_spider(i, i + 2)
        print('Fetching data for pages {}-{}'.format(i, i + 2))
        spider.main()
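
The insert in save_to_mysql assumes a det_job_info table already exists in the baidu database. A minimal one-off setup sketch follows; the original code only names the four columns, so the column types, the id primary key, and the charset here are assumptions:

import pymysql

# One-off setup for the table the spider writes to.
# Column names come from the insert above; the types, the id primary key,
# and the utf8mb4 charset are assumptions, not part of the original code.
conn = pymysql.connect(host='localhost', user='root', passwd='root123', db='baidu', port=3306)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS det_job_info (
        id INT AUTO_INCREMENT PRIMARY KEY,
        job_name VARCHAR(255),
        company_name VARCHAR(255),
        job_money VARCHAR(64),
        job_request TEXT
    ) DEFAULT CHARSET = utf8mb4
""")
conn.commit()
cursor.close()
conn.close()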
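
get_pages and get_page_info each fire a single requests.get with no timeout, so one transient proxy failure aborts the whole run. Below is a minimal sketch of a retrying fetch helper; fetch_with_retries, its retry count, and its timeout are illustrative assumptions rather than part of the original spider:

import time
import requests

def fetch_with_retries(url, headers, proxies, retries=3, timeout=10):
    # Hypothetical helper: retry transient network/proxy failures with a
    # simple linear backoff. Retry count and timeout are assumed values.
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            response.raise_for_status()  # treat HTTP error codes as failures too
            return response
        except requests.exceptions.RequestException as exc:
            print('attempt {} failed: {}'.format(attempt, exc))
            time.sleep(2 * attempt)
    raise RuntimeError('all {} attempts failed for {}'.format(retries, url))

Swapping this in for the bare requests.get calls in get_pages and get_page_info would keep a long crawl alive through occasional proxy drops.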