I. Topic Web Crawler Design Proposal
1. Crawler name: scraping job listings from 51job (前程無憂)
2. Content to be crawled
The crawler collects the company name, work location, salary, education requirement, work experience, number of openings, company size, company type, company benefits, and posting date of each listing.
3. Design overview
Approach: crawl the data, clean the data, then visualize it.
II. Structural Analysis of the Target Pages
Open 51job, go to the job-search page, then right-click and choose "Inspect" to examine the elements that hold each field.
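From the inspected pages, the search-result URL follows a fixed pattern: the URL-encoded keyword and the page number are embedded in the path. A minimal sketch of building that URL (the helper name is illustrative; it mirrors the pattern used by getfront() in the crawler code below):

import urllib.parse

def build_list_url(keyword, page):
    # Build the 51job search-result URL for a keyword and page number,
    # following the same pattern the crawler below relies on.
    quoted = urllib.parse.quote(keyword)  # percent-encode the Chinese keyword
    return ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
            + quoted + ',2,' + str(page) + '.html')

# Example: first results page for the keyword "數據"
print(build_list_url('數據', 1))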
Crawl the information and store it in an Excel file.
import urllib.request
import xlwt
import re
import urllib.parse
import time

header = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

def getfront(page, item):              # page is the page number, item is the search keyword entered below
    result = urllib.parse.quote(item)  # percent-encode the keyword
    ur1 = result + ',2,' + str(page) + '.html'
    ur2 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    res = ur2 + ur1
    a = urllib.request.urlopen(urllib.request.Request(res, headers=header))  # send the browser-like headers defined above
    html = a.read().decode('gbk')      # the page is GBK-encoded; decode it to unicode
    return html

def getInformation(html):
    # capture groups: job title, job URL, company name, company URL, location, salary, posting date
    reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?', re.S)  # re.S lets '.' match newlines
    items = re.findall(reg, html)
    return items

# create the workbook
excel1 = xlwt.Workbook()
# add the sheet and write the header row
sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)
sheet1.write(0, 0, '序號')
sheet1.write(0, 1, '職位')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '公司地點')
sheet1.write(0, 4, '公司性質')
sheet1.write(0, 5, '薪資')
sheet1.write(0, 6, '學歷要求')
sheet1.write(0, 7, '工作經驗')
sheet1.write(0, 8, '公司規模')
sheet1.write(0, 9, '公司類型')
sheet1.write(0, 10, '公司福利')
sheet1.write(0, 11, '發布時間')

number = 1
item = input()

for j in range(1, 10000):              # adjust the page range as needed
    try:
        print("正在爬取第" + str(j) + "頁數據...")
        html = getfront(j, item)       # fetch the list-page source
        for i in getInformation(html):
            try:
                url1 = i[1]            # URL of the job-detail page
                res1 = urllib.request.urlopen(url1).read().decode('gbk')
                company = re.findall(re.compile(r'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?', re.S), res1)
                job_need = re.findall(re.compile(r'<p class="msg ltype".*?>.*? <span>|</span> (.*?) <span>|</span> (.*?) <span>|</span> .*?</p>', re.S), res1)
                welfare = re.findall(re.compile(r'<span class="sp4">(.*?)</span>', re.S), res1)
                print(i[0], i[2], i[4], i[5], company[0][0], job_need[2][0], job_need[1][0], company[0][1], company[0][2], welfare, i[6])
                sheet1.write(number, 0, number)
                sheet1.write(number, 1, i[0])
                sheet1.write(number, 2, i[2])
                sheet1.write(number, 3, i[4])
                sheet1.write(number, 4, company[0][0])
                sheet1.write(number, 5, i[5])
                sheet1.write(number, 6, job_need[1][0])
                sheet1.write(number, 7, job_need[2][0])
                sheet1.write(number, 8, company[0][1])
                sheet1.write(number, 9, company[0][2])
                sheet1.write(number, 10, (" ".join(str(i) for i in welfare)))
                sheet1.write(number, 11, i[6])
                number += 1
                excel1.save("51job.xls")
                time.sleep(0.3)        # pause between requests so heavy crawling is not mistaken for an attack and the IP banned
            except:
                pass
    except:
        pass
Data cleaning:
1. Open the crawled file. Rows containing empty values (NaN) are dropped entirely; rows with an erroneous job title or with fields shifted into the wrong columns (for example "招X人", the number of openings, appearing in the education column) are removed; salary units are made consistent; and the cleaned data are saved to a separate file.
#coding:utf-8
import pandas as pd
import re
# in addition, the xlrd package must be installed to read .xls files

data = pd.read_excel(r'51job.xls', sheet_name='Job')
result = pd.DataFrame(data)

a = result.dropna(axis=0, how='any')     # drop every row that contains a NaN
pd.set_option('display.max_rows', None)  # print all rows without truncation

b = u'數據'
number = 1
li = a['職位']
for i in range(0, len(li)):
    try:
        if b in li[i]:                   # keep only rows whose job title contains the search keyword
            # print(number, li[i])
            number += 1
        else:
            a = a.drop(i, axis=0)
    except:
        pass

b2 = u'人'
li2 = a['學歷要求']
for i in range(0, len(li2)):
    try:
        if b2 in li2[i]:                 # "招X人" leaked into the education column, so the row is malformed
            # print(number, li2[i])
            number += 1
            a = a.drop(i, axis=0)
    except:
        pass

b3 = u'萬/年'
b4 = u'千/月'
li3 = a['薪資']
# the commented-out print calls are only for debugging
for i in range(0, len(li3)):
    try:
        if b3 in li3[i]:
            x = re.findall(r'\d*\.?\d+', li3[i])
            # print(x)
            min_ = format(float(x[0]) / 12, '.2f')   # convert to float and keep two decimal places
            max_ = format(float(x[1]) / 12, '.2f')
            a.loc[i, '薪資'] = min_ + '-' + max_ + u'萬/月'   # convert 萬/年 to 萬/月 and write it back
        if b4 in li3[i]:
            x = re.findall(r'\d*\.?\d+', li3[i])
            # print(x)
            # input()
            min_ = format(float(x[0]) / 10, '.2f')
            max_ = format(float(x[1]) / 10, '.2f')
            a.loc[i, '薪資'] = min_ + '-' + max_ + u'萬/月'   # convert 千/月 to 萬/月 and write it back
        print(i, a.loc[i, '薪資'])
    except:
        pass

# save the cleaned data to another file
a.to_excel('51job2.xls', sheet_name='Job', index=False)
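For reference, the same cleaning rules can also be written more compactly with vectorised pandas operations. This is only a sketch of an alternative approach (it assumes the column names above and the search keyword 數據), not the script used in this project:

import re
import pandas as pd

def to_wan_per_month(s):
    # Normalise a salary string to 萬/月; return the input unchanged if it is
    # already in that unit or cannot be parsed.
    nums = [float(n) for n in re.findall(r'\d*\.?\d+', str(s))]
    if len(nums) < 2:
        return s
    if '萬/年' in s:
        lo, hi = nums[0] / 12, nums[1] / 12
    elif '千/月' in s:
        lo, hi = nums[0] / 10, nums[1] / 10
    else:
        return s
    return '{:.2f}-{:.2f}萬/月'.format(lo, hi)

df = pd.read_excel('51job.xls', sheet_name='Job')
df = df.dropna(how='any')                              # drop rows with any NaN
df = df[df['職位'].str.contains('數據', na=False)]      # keep only relevant job titles
df = df[~df['學歷要求'].str.contains('人', na=False)]   # drop rows with shifted columns
df['薪資'] = df['薪資'].map(to_wan_per_month)           # unify the salary unit
df.to_excel('51job2.xls', sheet_name='Job', index=False)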
Data visualization:
Plot the experience-versus-salary chart, the education-versus-salary chart, and an education donut (rose) chart.
First open the cleaned file and create separate lists to hold the salary, education requirement, and other fields.
import re
import pandas as pd
import matplotlib.pyplot as plt
from pyecharts import Pie   # pyecharts 0.5.x API

file = pd.read_excel(r'51job2.xls', sheet_name='Job')
f = pd.DataFrame(file)
pd.set_option('display.max_rows', None)

add = f['公司地點']
sly = f['薪資']
edu = f['學歷要求']
exp = f['工作經驗']
address = []
salary = []
education = []
experience = []
for i in range(0, len(f)):
    try:
        a = add[i].split('-')
        address.append(a[0])                 # keep only the city part of the location
        # print(address[i])
        s = re.findall(r'\d*\.?\d+', sly[i])
        s1 = float(s[0])
        s2 = float(s[1])
        salary.append([s1, s2])              # [minimum, maximum] salary in 萬/月
        # print(salary[i])
        education.append(edu[i])
        # print(education[i])
        experience.append(exp[i])
        # print(experience[i])
    except:
        pass

min_s = []   # list of minimum salaries
max_s = []   # list of maximum salaries
for i in range(0, len(experience)):
    min_s.append(salary[i][0])
    max_s.append(salary[i][1])

my_df = pd.DataFrame({'experience': experience, 'min_salay': min_s, 'max_salay': max_s})   # relate work experience to salary
data1 = my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()

my_df2 = pd.DataFrame({'education': education, 'min_salay': min_s, 'max_salay': max_s})    # relate education to salary
data2 = my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()

def get_edu(list):                           # count how many postings require each education level
    education2 = {}
    for i in set(list):
        education2[i] = list.count(i)
    return education2

dir1 = get_edu(education)
# print(dir1)
attr = dir1.keys()
value = dir1.values()
pie = Pie("學歷要求")
pie.add("", attr, value, center=[50, 50], is_random=False, radius=[30, 75], rosetype='radius',
        is_legend_show=False, is_label_show=True, legend_orient='vertical')
pie.render('學歷要求玫瑰圖.html')
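The Pie calls above use the legacy pyecharts 0.5.x API. If a newer pyecharts (1.x or later) is installed, roughly the same rose chart can be produced as in the sketch below; the hard-coded counts are placeholders for the result of get_edu(education):

from pyecharts import options as opts
from pyecharts.charts import Pie

edu_counts = {'大專': 80, '本科': 120, '碩士': 15}   # placeholder counts; use get_edu(education) in practice

pie = (
    Pie()
    .add(
        "",
        [list(pair) for pair in edu_counts.items()],   # pyecharts 1.x expects (name, value) pairs
        radius=["30%", "75%"],
        rosetype="radius",
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="學歷要求"),
        legend_opts=opts.LegendOpts(is_show=False),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
)
pie.render("學歷要求玫瑰圖.html")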
The complete code is as follows:
import urllib.request
import xlwt
import re
import urllib.parse
import time
header={
'Host':'search.51job.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
def getfront(page,item):    #page is the page number, item is the search keyword entered below
    result = urllib.parse.quote(item)    #percent-encode the keyword
ur1 = result+',2,'+str(page)+'.html'
ur2 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
res = ur2+ur1
    a = urllib.request.urlopen(urllib.request.Request(res, headers=header))    #send the browser-like headers defined above
    html = a.read().decode('gbk')    #the page is GBK-encoded; decode it to unicode
return html
def getInformation(html):
    reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?',re.S)    #re.S lets '.' match newlines
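    # the capture groups above are: job title, job URL, company name, company URL, location, salary, posting date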
items=re.findall(reg,html)
return items
# create the workbook
excel1 = xlwt.Workbook()
# add the sheet and write the header row
sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)
sheet1.write(0, 0, '序號')
sheet1.write(0, 1, '職位')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '公司地點')
sheet1.write(0, 4, '公司性質')
sheet1.write(0, 5, '薪資')
sheet1.write(0, 6, '學歷要求')
sheet1.write(0, 7, '工作經驗')
sheet1.write(0, 8, '公司規模')
sheet1.write(0, 9, '公司類型')
sheet1.write(0, 10,'公司福利')
sheet1.write(0, 11,'發布時間')
number = 1
item = input()
for j in range(1,10000):    #adjust the page range as needed
try:
print("正在爬取第"+str(j)+"頁數據...")
        html = getfront(j,item)    #fetch the list-page source
for i in getInformation(html):
try:
                url1 = i[1]    #URL of the job-detail page
res1 = urllib.request.urlopen(url1).read().decode('gbk')
company = re.findall(re.compile(r'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?',re.S),res1)
job_need = re.findall(re.compile(r'<p class="msg ltype".*?>.*? <span>|</span> (.*?) <span>|</span> (.*?) <span>|</span> .*?</p>',re.S),res1)
welfare = re.findall(re.compile(r'<span class="sp4">(.*?)</span>',re.S),res1)
print(i[0],i[2],i[4],i[5],company[0][0],job_need[2][0],job_need[1][0],company[0][1],company[0][2],welfare,i[6])
sheet1.write(number,0,number)
sheet1.write(number,1,i[0])
sheet1.write(number,2,i[2])
sheet1.write(number,3,i[4])
sheet1.write(number,4,company[0][0])
sheet1.write(number,5,i[5])
sheet1.write(number,6,job_need[1][0])
sheet1.write(number,7,job_need[2][0])
sheet1.write(number,8,company[0][1])
sheet1.write(number,9,company[0][2])
sheet1.write(number,10,(" ".join(str(i) for i in welfare)))
sheet1.write(number,11,i[6])
number+=1
excel1.save("51job.xls")
                time.sleep(0.3)    #pause between requests so heavy crawling is not mistaken for an attack and the IP banned
except:
pass
except:
pass
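# ===== Data-cleaning script: reads the 51job.xls produced by the crawler above =====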
#coding:utf-8
import pandas as pd
import re
# in addition, the xlrd package must be installed to read .xls files
data = pd.read_excel(r'51job.xls',sheet_name='Job')
result = pd.DataFrame(data)
a = result.dropna(axis=0,how='any')
pd.set_option('display.max_rows',None)    #print all rows without truncation
b = u'數據'
number = 1
li = a['職位']
for i in range(0,len(li)):
try:
if b in li[i]:
#print(number,li[i])
number+=1
else:
a = a.drop(i,axis=0)
except:
pass
b2= u'人'
li2 = a['學歷要求']
for i in range(0,len(li2)):
try:
if b2 in li2[i]:
#print(number,li2[i])
number+=1
a = a.drop(i,axis=0)
except:
pass
b3 =u'萬/年'
b4 =u'千/月'
li3 = a['薪資']
# the commented-out print calls are only for debugging
for i in range(0,len(li3)):
try:
if b3 in li3[i]:
x = re.findall(r'\d*\.?\d+',li3[i])
#print(x)
            min_ = format(float(x[0])/12,'.2f')    #convert to float and keep two decimal places
max_ = format(float(x[1])/12,'.2f')
            a.loc[i,'薪資'] = min_+'-'+max_+u'萬/月'    #convert 萬/年 to 萬/月 and write it back
if b4 in li3[i]:
x = re.findall(r'\d*\.?\d+',li3[i])
#print(x)
#input()
min_ = format(float(x[0])/10,'.2f')
max_ = format(float(x[1])/10,'.2f')
            a.loc[i,'薪資'] = min_+'-'+max_+u'萬/月'    #convert 千/月 to 萬/月 and write it back
        print(i,a.loc[i,'薪資'])
except:
pass
# save the cleaned data to another file
a.to_excel('51job2.xls', sheet_name='Job', index=False)
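# ===== Visualization script: reads the cleaned 51job2.xls =====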
import pandas as pd
import re
import matplotlib.pyplot as plt
from pyecharts import Pie   # pyecharts 0.5.x API

file = pd.read_excel(r'51job2.xls',sheet_name='Job')
f = pd.DataFrame(file)
pd.set_option('display.max_rows',None)
add = f['公司地點']
sly = f['薪資']
edu = f['學歷要求']
exp = f['工作經驗']
address =[]
salary = []
education = []
experience = []
for i in range(0,len(f)):
try:
a = add[i].split('-')
address.append(a[0])
#print(address[i])
s = re.findall(r'\d*\.?\d+',sly[i])
s1= float(s[0])
s2 =float(s[1])
salary.append([s1,s2])
#print(salary[i])
education.append(edu[i])
#print(education[i])
experience.append(exp[i])
#print(experience[i])
except:
pass
min_s=[]    #list of minimum salaries
max_s=[]    #list of maximum salaries
for i in range(0,len(experience)):
min_s.append(salary[i][0])
    max_s.append(salary[i][1])
my_df = pd.DataFrame({'experience':experience, 'min_salay' : min_s, 'max_salay' : max_s})    #relate work experience to salary
data1 = my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()
my_df2 = pd.DataFrame({'education':education, 'min_salay' : min_s, 'max_salay' : max_s})    #relate education to salary
data2 = my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()
def get_edu(list):
education2 = {}
for i in set(list):
education2[i] = list.count(i)
return education2
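# get_edu counts how many postings require each education level (equivalent to collections.Counter)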
dir1 = get_edu(education)
# print(dir1)
attr= dir1.keys()
value = dir1.values()
pie = Pie("學歷要求")
pie.add("", attr, value, center=[50, 50], is_random=False, radius=[30, 75], rosetype='radius',
is_legend_show=False, is_label_show=True,legend_orient='vertical')
pie.render('學歷要求玫瑰圖.html')
Summary:
1. What conclusions can be drawn from analysing and visualizing the data?
Visualizing the crawled data makes the content of the pages much clearer and more intuitive to interpret.
2. Reflections
Over the course of this project I found learning Python quite difficult. Because my English is not solid, I often had to look up English words, and I was stuck on find_all for a long time without getting it to run; in the end I still did not fully understand it, so I need to invest more time going forward.