# -*- coding: utf-8 -*-
# @Time    : 2018/3/1 16:38
# @Author  : HT
# @Email   : acer_yuhaitao@163.com
# @File    : 51job.py
# @Software: PyCharm
"""Scrape job listings from search.51job.com into text files and SQLite.

For each configured search the first result page is fetched to learn the
total page count, then every page is downloaded, parsed with regular
expressions and each listing is appended to ``51job<jobname>.txt`` and
inserted into the ``51job`` table of the SQLite database at ``DB_PATH``.
"""
import io
import re
import sqlite3
import sys
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec so that
    # Chinese text can be concatenated and printed without explicit encoding.
    reload(sys)  # noqa: F821 (builtin on Python 2)
    sys.setdefaultencoding('utf8')

# Running count of listings scraped for the job currently being crawled.
i = 0

# SQLite database that receives the scraped rows.
DB_PATH = r"D:\Python-Test\WeiXin\db.sqlite3"

# One listing row: job title, company, address, wages, publish date.
_JOB_ROW_RE = re.compile(
    r'class="t1 ">.*?<a target="_blank" title="(.*?)".*?'
    r'<span class="t2"><a target="_blank" title="(.*?)".*?'
    r'<span class="t3">(.*?)</span>.*?'
    r'<span class="t4">(.*?)</span>.*?'
    r'<span class="t5">(.*?)</span>',
    re.S,
)

# The "N pages in total" label that sits next to the jump-to-page box.
_PAGE_TOTAL_RE = re.compile(
    r'<span class="td">(.*?)</span>'
    r'<input id="jump_page" class="mytxt" type="text" value="1"/>',
    re.S,
)

# The page-number segment of a first-page URL (dot escaped on purpose).
_PAGE_SEGMENT_RE = re.compile(r'1\.html')

_LIST_URL = 'http://search.51job.com/list/'
_AREA_BEIJING = '010000'
# Beijing, Shanghai, Guangzhou, Shenzhen (as encoded in the original URL).
_AREA_BIG_FOUR = '010000%252C040000%252C020000%252C030200'

# Query string variant that includes ``providesalary`` and leaves
# ``stype``/``fromType`` empty.
_QUERY_WITH_SALARY = (
    '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1'
    '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
    '&specialarea=00&from=&welfare='
)
# Query string variant with ``stype=1``/``fromType=1`` and no salary filter.
_QUERY_TYPED = (
    '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
    '&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line='
    '&specialarea=00&from=&welfare='
)


def _search_url(keyword, query, area=_AREA_BEIJING):
    """Build the first-page search URL for an already URL-encoded keyword."""
    return (_LIST_URL + area + ',000000,0000,00,9,99,' + keyword
            + ',2,1.html' + query)


# Searches crawled by all_job_get(): output label plus first-page URL.
# NOTE: the original list reused the Python-developer URL for the embedded
# and machine-learning entries and the machine-learning URL for the AI entry;
# each label now points at the URL for its own keyword. The mis-decoded
# "&deg" in "&degreefrom" has also been restored in every URL.
JOB_SEARCHES = [
    {
        'jobname': "python",
        'urlstart': _search_url(
            'Python%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5'
            '%25E7%25A8%258B%25E5%25B8%2588',
            _QUERY_WITH_SALARY),
    },
    {
        'jobname': u"嵌入式",
        'urlstart': _search_url(
            '%25E5%25B5%258C%25E5%2585%25A5%25E5%25BC%258F'
            '%25E5%25BC%2580%25E5%258F%2591',
            _QUERY_WITH_SALARY),
    },
    {
        'jobname': u"雲計算",
        'urlstart': _search_url(
            '%25E4%25BA%2591%25E8%25AE%25A1%25E7%25AE%2597',
            _QUERY_TYPED),
    },
    {
        'jobname': u"機器學習",
        'urlstart': _search_url(
            '%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0',
            _QUERY_TYPED),
    },
    {
        'jobname': u"人工智能",
        'urlstart': _search_url(
            '%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD',
            _QUERY_TYPED),
    },
    {
        'jobname': u"自動駕駛",
        'urlstart': _search_url(
            '%25E8%2587%25AA%25E5%258A%25A8%25E9%25A9%25BE%25E9%25A9%25B6',
            _QUERY_TYPED),
    },
    {
        'jobname': u"北上廣深python",
        'urlstart': _search_url('python', _QUERY_TYPED, area=_AREA_BIG_FOUR),
    },
]

# Search used by one_job_get() when no arguments are given.
DEFAULT_JOBNAME = u"BJ技術支持"
DEFAULT_URLSTART = _search_url(
    '%25E6%258A%2580%25E6%259C%25AF%25E6%2594%25AF%25E6%258C%2581',
    _QUERY_WITH_SALARY)


def url_input(url):
    """Download *url* and return its body decoded from GBK.

    Raises whatever ``urlopen`` raises on network errors and
    ``UnicodeDecodeError`` if the body is not valid GBK.
    """
    # closing() guarantees the connection is released on both Python 2 and 3.
    with closing(urlopen(url)) as response:
        return response.read().decode('gbk')


def find_data(html):
    """Extract the listings from a result page.

    Returns a list of ``(job, company, address, wages, date)`` tuples, one
    per listing matched in *html* (empty list when nothing matches).
    """
    return _JOB_ROW_RE.findall(html)


def find_all_page(html):
    """Return the total page count, as a digit string, from the first page.

    The label next to the jump-to-page box is located and every non-digit
    character is stripped from it.

    Raises ``IndexError`` when the label is not present in *html*.
    """
    labels = _PAGE_TOTAL_RE.findall(html)
    return re.sub(r"\D", "", labels[0])


def data_to_sqlite(id, job, company, address, wages, date, jobname,
                   db_path=DB_PATH):
    """Insert one listing into the ``51job`` table.

    *id* is accepted for backward compatibility and is not written.
    *db_path* selects the database file and defaults to ``DB_PATH``.

    Database errors are reported on stdout and otherwise ignored so that one
    bad row does not abort the crawl.
    """
    # Placeholders keep scraped text (which may contain quotes) out of the
    # SQL statement itself.
    sql = ('INSERT INTO "51job" (job, company, address, wages, date, jobname) '
           'VALUES (?, ?, ?, ?, ?, ?)')
    with closing(sqlite3.connect(db_path)) as db:
        try:
            db.execute(sql, (job, company, address, wages, date, jobname))
            db.commit()
        except sqlite3.Error as e:
            print("ERRO: %s" % e)


def data_to_txt(str, jobname):
    """Append the text *str* to ``51job<jobname>.txt`` as UTF-8."""
    text = str  # the parameter name shadows the builtin; kept for callers
    if isinstance(text, bytes):
        # io.open only accepts text; byte strings are assumed to be UTF-8.
        text = text.decode('utf-8')
    with io.open(u"51job%s.txt" % (jobname), 'a', encoding='utf-8') as f:
        f.write(text)


def print_items(data_items, jobname):
    """Store and print every listing in *data_items*.

    Each item is a ``(job, company, address, wages, date)`` tuple as returned
    by ``find_data``. Every listing increments the module-level counter ``i``,
    is appended to the text file, inserted into the database and printed.
    """
    global i
    for data in data_items:
        job, company, address, wages, date = data[:5]
        i += 1
        line = ("[" + str(i) + "] "
                + "--".join((job, company, address, wages, date)) + "\n")
        data_to_txt(line, jobname)
        # The row id is unused by data_to_sqlite, so None is passed.
        data_to_sqlite(None, job, company, address, wages, date, jobname)
        print(line)


def urlformat(urlstart):
    """Turn a first-page URL into a template with ``{}`` as the page number.

    Every literal ``1.html`` in *urlstart* is replaced by ``{}.html``.
    """
    return _PAGE_SEGMENT_RE.sub('{}.html', urlstart)


def get_page_html(page_num, urlstart):
    """Return the URLs of pages 1..*page_num* derived from *urlstart*."""
    template = urlformat(urlstart)
    # Result pages are numbered from 1 (the first-page URL ends in "1.html").
    return [template.format(page) for page in range(1, page_num + 1)]


def _crawl_job(jobname, urlstart):
    """Crawl every result page of one search, then reset the counter ``i``."""
    global i
    first_page = url_input(urlstart)
    all_page_num = int(find_all_page(first_page))
    print("+++++++++++++++++%s++++++++++++++++++++" % (all_page_num))
    try:
        for url in get_page_html(all_page_num, urlstart):
            print_items(find_data(url_input(url)), jobname)
    finally:
        # Restart numbering for the next job.
        i = 0


def all_job_get(urldict=None):
    """Crawl several searches in a row.

    *urldict* is a list of ``{'jobname': ..., 'urlstart': ...}`` dicts and
    defaults to ``JOB_SEARCHES``.
    """
    searches = JOB_SEARCHES if urldict is None else urldict
    for data in searches:
        _crawl_job(data['jobname'], data['urlstart'])


def one_job_get(jobname=DEFAULT_JOBNAME, urlstart=DEFAULT_URLSTART):
    """Crawl a single search.

    *jobname* labels the output file and database rows; *urlstart* is the
    first-page URL of the search. Any entry of ``JOB_SEARCHES`` can be passed
    to crawl one of the other configured searches.
    """
    _crawl_job(jobname, urlstart)


if __name__ == '__main__':
    # Use all_job_get() instead to crawl every entry of JOB_SEARCHES.
    one_job_get()