I. Topic Web Crawler Design Proposal
1. Crawler name: scraping job listings from 51job (前程無憂)
2. Content to be crawled
The crawler collects the company name, work location, salary, education requirement, work experience, number of openings, company size, company type, company benefits, and posting date of each listing.
3. Design overview
Approach: crawl the data, clean the data, then visualize it.
II. Structural Analysis of the Target Pages
Open 51job, go to the job-search page, then right-click and choose "Inspect" to examine the elements that hold each field.
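From the inspected pages, the search-result URL follows a fixed pattern: the URL-encoded keyword and the page number are embedded in the path. A minimal sketch of building that URL (the helper name is illustrative; it mirrors the pattern used by getfront() in the crawler code below):

import urllib.parse

def build_list_url(keyword, page):
    # Build the 51job search-result URL for a keyword and page number,
    # following the same pattern the crawler below relies on.
    quoted = urllib.parse.quote(keyword)  # percent-encode the Chinese keyword
    return ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
            + quoted + ',2,' + str(page) + '.html')

# Example: first results page for the keyword "數據"
print(build_list_url('數據', 1))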
Crawl the information and store it in an Excel file.
import urllib.request
import xlwt
import re
import urllib.parse
import time

header = {
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

def getfront(page, item):              # page is the page number, item is the search keyword entered below
    result = urllib.parse.quote(item)  # percent-encode the keyword
    ur1 = result + ',2,' + str(page) + '.html'
    ur2 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    res = ur2 + ur1
    a = urllib.request.urlopen(urllib.request.Request(res, headers=header))  # send the browser-like headers defined above
    html = a.read().decode('gbk')      # the page is GBK-encoded; decode it to unicode
    return html

def getInformation(html):
    # capture groups: job title, job URL, company name, company URL, location, salary, posting date
    reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?', re.S)  # re.S lets '.' match newlines
    items = re.findall(reg, html)
    return items

# create the workbook
excel1 = xlwt.Workbook()
# add the sheet and write the header row
sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)
sheet1.write(0, 0, '序號')
sheet1.write(0, 1, '職位')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '公司地點')
sheet1.write(0, 4, '公司性質')
sheet1.write(0, 5, '薪資')
sheet1.write(0, 6, '學歷要求')
sheet1.write(0, 7, '工作經驗')
sheet1.write(0, 8, '公司規模')
sheet1.write(0, 9, '公司類型')
sheet1.write(0, 10, '公司福利')
sheet1.write(0, 11, '發布時間')

number = 1
item = input()

for j in range(1, 10000):              # adjust the page range as needed
    try:
        print("正在爬取第" + str(j) + "頁數據...")
        html = getfront(j, item)       # fetch the list-page source
        for i in getInformation(html):
            try:
                url1 = i[1]            # URL of the job-detail page
                res1 = urllib.request.urlopen(url1).read().decode('gbk')
                company = re.findall(re.compile(r'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?', re.S), res1)
                job_need = re.findall(re.compile(r'<p class="msg ltype".*?>.*? <span>|</span> (.*?) <span>|</span> (.*?) <span>|</span> .*?</p>', re.S), res1)
                welfare = re.findall(re.compile(r'<span class="sp4">(.*?)</span>', re.S), res1)
                print(i[0], i[2], i[4], i[5], company[0][0], job_need[2][0], job_need[1][0], company[0][1], company[0][2], welfare, i[6])
                sheet1.write(number, 0, number)
                sheet1.write(number, 1, i[0])
                sheet1.write(number, 2, i[2])
                sheet1.write(number, 3, i[4])
                sheet1.write(number, 4, company[0][0])
                sheet1.write(number, 5, i[5])
                sheet1.write(number, 6, job_need[1][0])
                sheet1.write(number, 7, job_need[2][0])
                sheet1.write(number, 8, company[0][1])
                sheet1.write(number, 9, company[0][2])
                sheet1.write(number, 10, (" ".join(str(i) for i in welfare)))
                sheet1.write(number, 11, i[6])
                number += 1
                excel1.save("51job.xls")
                time.sleep(0.3)        # pause between requests so heavy crawling is not mistaken for an attack and the IP banned
            except:
                pass
    except:
        pass
Data cleaning:
1. Open the crawled file. Rows containing empty values (NaN) are dropped entirely; rows with an erroneous job title or with fields shifted into the wrong columns (for example "招X人", the number of openings, appearing in the education column) are removed; salary units are made consistent; and the cleaned data are saved to a separate file.
#coding:utf-8
import pandas as pd
import re
# in addition, the xlrd package must be installed to read .xls files

data = pd.read_excel(r'51job.xls', sheet_name='Job')
result = pd.DataFrame(data)

a = result.dropna(axis=0, how='any')     # drop every row that contains a NaN
pd.set_option('display.max_rows', None)  # print all rows without truncation

b = u'數據'
number = 1
li = a['職位']
for i in range(0, len(li)):
    try:
        if b in li[i]:                   # keep only rows whose job title contains the search keyword
            # print(number, li[i])
            number += 1
        else:
            a = a.drop(i, axis=0)
    except:
        pass

b2 = u'人'
li2 = a['學歷要求']
for i in range(0, len(li2)):
    try:
        if b2 in li2[i]:                 # "招X人" leaked into the education column, so the row is malformed
            # print(number, li2[i])
            number += 1
            a = a.drop(i, axis=0)
    except:
        pass

b3 = u'萬/年'
b4 = u'千/月'
li3 = a['薪資']
# the commented-out print calls are only for debugging
for i in range(0, len(li3)):
    try:
        if b3 in li3[i]:
            x = re.findall(r'\d*\.?\d+', li3[i])
            # print(x)
            min_ = format(float(x[0]) / 12, '.2f')   # convert to float and keep two decimal places
            max_ = format(float(x[1]) / 12, '.2f')
            a.loc[i, '薪資'] = min_ + '-' + max_ + u'萬/月'   # convert 萬/年 to 萬/月 and write it back
        if b4 in li3[i]:
            x = re.findall(r'\d*\.?\d+', li3[i])
            # print(x)
            # input()
            min_ = format(float(x[0]) / 10, '.2f')
            max_ = format(float(x[1]) / 10, '.2f')
            a.loc[i, '薪資'] = min_ + '-' + max_ + u'萬/月'   # convert 千/月 to 萬/月 and write it back
        print(i, a.loc[i, '薪資'])
    except:
        pass

# save the cleaned data to another file
a.to_excel('51job2.xls', sheet_name='Job', index=False)
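For reference, the same cleaning rules can also be written more compactly with vectorised pandas operations. This is only a sketch of an alternative approach (it assumes the column names above and the search keyword 數據), not the script used in this project:

import re
import pandas as pd

def to_wan_per_month(s):
    # Normalise a salary string to 萬/月; return the input unchanged if it is
    # already in that unit or cannot be parsed.
    nums = [float(n) for n in re.findall(r'\d*\.?\d+', str(s))]
    if len(nums) < 2:
        return s
    if '萬/年' in s:
        lo, hi = nums[0] / 12, nums[1] / 12
    elif '千/月' in s:
        lo, hi = nums[0] / 10, nums[1] / 10
    else:
        return s
    return '{:.2f}-{:.2f}萬/月'.format(lo, hi)

df = pd.read_excel('51job.xls', sheet_name='Job')
df = df.dropna(how='any')                              # drop rows with any NaN
df = df[df['職位'].str.contains('數據', na=False)]      # keep only relevant job titles
df = df[~df['學歷要求'].str.contains('人', na=False)]   # drop rows with shifted columns
df['薪資'] = df['薪資'].map(to_wan_per_month)           # unify the salary unit
df.to_excel('51job2.xls', sheet_name='Job', index=False)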
Data visualization:
Plot the experience-versus-salary chart, the education-versus-salary chart, and an education donut (rose) chart.
First open the cleaned file and create separate lists to hold the salary, education requirement, and other fields.
import re
import pandas as pd
import matplotlib.pyplot as plt
from pyecharts import Pie   # pyecharts 0.5.x API

file = pd.read_excel(r'51job2.xls', sheet_name='Job')
f = pd.DataFrame(file)
pd.set_option('display.max_rows', None)

add = f['公司地點']
sly = f['薪資']
edu = f['學歷要求']
exp = f['工作經驗']
address = []
salary = []
education = []
experience = []
for i in range(0, len(f)):
    try:
        a = add[i].split('-')
        address.append(a[0])                 # keep only the city part of the location
        # print(address[i])
        s = re.findall(r'\d*\.?\d+', sly[i])
        s1 = float(s[0])
        s2 = float(s[1])
        salary.append([s1, s2])              # [minimum, maximum] salary in 萬/月
        # print(salary[i])
        education.append(edu[i])
        # print(education[i])
        experience.append(exp[i])
        # print(experience[i])
    except:
        pass

min_s = []   # list of minimum salaries
max_s = []   # list of maximum salaries
for i in range(0, len(experience)):
    min_s.append(salary[i][0])
    max_s.append(salary[i][1])

my_df = pd.DataFrame({'experience': experience, 'min_salay': min_s, 'max_salay': max_s})   # relate work experience to salary
data1 = my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()

my_df2 = pd.DataFrame({'education': education, 'min_salay': min_s, 'max_salay': max_s})    # relate education to salary
data2 = my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()

def get_edu(list):                           # count how many postings require each education level
    education2 = {}
    for i in set(list):
        education2[i] = list.count(i)
    return education2

dir1 = get_edu(education)
# print(dir1)
attr = dir1.keys()
value = dir1.values()
pie = Pie("學歷要求")
pie.add("", attr, value, center=[50, 50], is_random=False, radius=[30, 75], rosetype='radius',
        is_legend_show=False, is_label_show=True, legend_orient='vertical')
pie.render('學歷要求玫瑰圖.html')
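The Pie calls above use the legacy pyecharts 0.5.x API. If a newer pyecharts (1.x or later) is installed, roughly the same rose chart can be produced as in the sketch below; the hard-coded counts are placeholders for the result of get_edu(education):

from pyecharts import options as opts
from pyecharts.charts import Pie

edu_counts = {'大專': 80, '本科': 120, '碩士': 15}   # placeholder counts; use get_edu(education) in practice

pie = (
    Pie()
    .add(
        "",
        [list(pair) for pair in edu_counts.items()],   # pyecharts 1.x expects (name, value) pairs
        radius=["30%", "75%"],
        rosetype="radius",
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="學歷要求"),
        legend_opts=opts.LegendOpts(is_show=False),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
)
pie.render("學歷要求玫瑰圖.html")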
The complete code is as follows:
import urllib.request
import xlwt
import re
import urllib.parse
import time
header={
'Host':'search.51job.com',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
def getfront(page,item):    #page is the page number, item is the search keyword entered below
    result = urllib.parse.quote(item)    #percent-encode the keyword
ur1 = result+',2,'+str(page)+'.html'
ur2 = 'https://search.51job.com/list/000000,000000,0000,00,9,99,'
res = ur2+ur1
    a = urllib.request.urlopen(urllib.request.Request(res, headers=header))    #send the browser-like headers defined above
    html = a.read().decode('gbk')    #the page is GBK-encoded; decode it to unicode
return html
def getInformation(html):
    reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)" href="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)" href="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>.*?',re.S)    #re.S lets '.' match newlines
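    # the capture groups above are: job title, job URL, company name, company URL, location, salary, posting date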
items=re.findall(reg,html)
return items
# create the workbook
excel1 = xlwt.Workbook()
# add the sheet and write the header row
sheet1 = excel1.add_sheet('Job', cell_overwrite_ok=True)
sheet1.write(0, 0, '序號')
sheet1.write(0, 1, '職位')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '公司地點')
sheet1.write(0, 4, '公司性質')
sheet1.write(0, 5, '薪資')
sheet1.write(0, 6, '學歷要求')
sheet1.write(0, 7, '工作經驗')
sheet1.write(0, 8, '公司規模')
sheet1.write(0, 9, '公司類型')
sheet1.write(0, 10,'公司福利')
sheet1.write(0, 11,'發布時間')
number = 1
item = input()
for j in range(1,10000):    #adjust the page range as needed
try:
print("正在爬取第"+str(j)+"頁數據...")
        html = getfront(j,item)    #fetch the list-page source
for i in getInformation(html):
try:
                url1 = i[1]    #URL of the job-detail page
res1 = urllib.request.urlopen(url1).read().decode('gbk')
company = re.findall(re.compile(r'<div class="com_tag">.*?<p class="at" title="(.*?)"><span class="i_flag">.*?<p class="at" title="(.*?)">.*?<p class="at" title="(.*?)">.*?',re.S),res1)
job_need = re.findall(re.compile(r'<p class="msg ltype".*?>.*? <span>|</span> (.*?) <span>|</span> (.*?) <span>|</span> .*?</p>',re.S),res1)
welfare = re.findall(re.compile(r'<span class="sp4">(.*?)</span>',re.S),res1)
print(i[0],i[2],i[4],i[5],company[0][0],job_need[2][0],job_need[1][0],company[0][1],company[0][2],welfare,i[6])
sheet1.write(number,0,number)
sheet1.write(number,1,i[0])
sheet1.write(number,2,i[2])
sheet1.write(number,3,i[4])
sheet1.write(number,4,company[0][0])
sheet1.write(number,5,i[5])
sheet1.write(number,6,job_need[1][0])
sheet1.write(number,7,job_need[2][0])
sheet1.write(number,8,company[0][1])
sheet1.write(number,9,company[0][2])
sheet1.write(number,10,(" ".join(str(i) for i in welfare)))
sheet1.write(number,11,i[6])
number+=1
excel1.save("51job.xls")
                time.sleep(0.3)    #pause between requests so heavy crawling is not mistaken for an attack and the IP banned
except:
pass
except:
pass
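# ===== Data-cleaning script: reads the 51job.xls produced by the crawler above =====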
#coding:utf-8
import pandas as pd
import re
# in addition, the xlrd package must be installed to read .xls files
data = pd.read_excel(r'51job.xls',sheet_name='Job')
result = pd.DataFrame(data)
a = result.dropna(axis=0,how='any')
pd.set_option('display.max_rows',None)    #print all rows without truncation
b = u'數據'
number = 1
li = a['職位']
for i in range(0,len(li)):
try:
if b in li[i]:
#print(number,li[i])
number+=1
else:
a = a.drop(i,axis=0)
except:
pass
b2= u'人'
li2 = a['學歷要求']
for i in range(0,len(li2)):
try:
if b2 in li2[i]:
#print(number,li2[i])
number+=1
a = a.drop(i,axis=0)
except:
pass
b3 =u'萬/年'
b4 =u'千/月'
li3 = a['薪資']
# the commented-out print calls are only for debugging
for i in range(0,len(li3)):
try:
if b3 in li3[i]:
x = re.findall(r'\d*\.?\d+',li3[i])
#print(x)
            min_ = format(float(x[0])/12,'.2f')    #convert to float and keep two decimal places
max_ = format(float(x[1])/12,'.2f')
            a.loc[i,'薪資'] = min_+'-'+max_+u'萬/月'    #convert 萬/年 to 萬/月 and write it back
if b4 in li3[i]:
x = re.findall(r'\d*\.?\d+',li3[i])
#print(x)
#input()
min_ = format(float(x[0])/10,'.2f')
max_ = format(float(x[1])/10,'.2f')
            a.loc[i,'薪資'] = min_+'-'+max_+u'萬/月'    #convert 千/月 to 萬/月 and write it back
        print(i,a.loc[i,'薪資'])
except:
pass
# save the cleaned data to another file
a.to_excel('51job2.xls', sheet_name='Job', index=False)
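# ===== Visualization script: reads the cleaned 51job2.xls =====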
import pandas as pd
import re
import matplotlib.pyplot as plt
from pyecharts import Pie   # pyecharts 0.5.x API

file = pd.read_excel(r'51job2.xls',sheet_name='Job')
f = pd.DataFrame(file)
pd.set_option('display.max_rows',None)
add = f['公司地點']
sly = f['薪資']
edu = f['學歷要求']
exp = f['工作經驗']
address =[]
salary = []
education = []
experience = []
for i in range(0,len(f)):
try:
a = add[i].split('-')
address.append(a[0])
#print(address[i])
s = re.findall(r'\d*\.?\d+',sly[i])
s1= float(s[0])
s2 =float(s[1])
salary.append([s1,s2])
#print(salary[i])
education.append(edu[i])
#print(education[i])
experience.append(exp[i])
#print(experience[i])
except:
pass
min_s=[]    #list of minimum salaries
max_s=[]    #list of maximum salaries
for i in range(0,len(experience)):
min_s.append(salary[i][0])
    max_s.append(salary[i][1])
my_df = pd.DataFrame({'experience':experience, 'min_salay' : min_s, 'max_salay' : max_s})    #relate work experience to salary
data1 = my_df.groupby('experience').mean()['min_salay'].plot(kind='line')
plt.show()
my_df2 = pd.DataFrame({'education':education, 'min_salay' : min_s, 'max_salay' : max_s})    #relate education to salary
data2 = my_df2.groupby('education').mean()['min_salay'].plot(kind='line')
plt.show()
def get_edu(list):
education2 = {}
for i in set(list):
education2[i] = list.count(i)
return education2
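# get_edu counts how many postings require each education level (equivalent to collections.Counter)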
dir1 = get_edu(education)
# print(dir1)
attr= dir1.keys()
value = dir1.values()
pie = Pie("學歷要求")
pie.add("", attr, value, center=[50, 50], is_random=False, radius=[30, 75], rosetype='radius',
is_legend_show=False, is_label_show=True,legend_orient='vertical')
pie.render('學歷要求玫瑰圖.html')
Summary:
1. What conclusions can be drawn from analysing and visualizing the data?
Visualizing the crawled data makes the content of the pages much clearer and more intuitive to interpret.
2. Reflections
Over the course of this project I found learning Python quite difficult. Because my English is not solid, I often had to look up English words, and I was stuck on find_all for a long time without getting it to run; in the end I still did not fully understand it, so I need to invest more time going forward.