Scraping job data from 51job (前程無憂): big data positions
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 14:47:27 2019

@author: loo
"""

import scrapy
import csv
from scrapy.crawler import CrawlerProcess


class MySpider(scrapy.Spider):
    name = "spider"

    def __init__(self):
        super().__init__()
        # open the CSV file that scraped rows are written to
        self.f = open('crawl_51jobs.csv', 'wt', newline='', encoding='GBK', errors='ignore')
        self.writer = csv.writer(self.f)
        # header: title, locality, salary, companyName, releaseTime
        self.writer.writerow(('職位', '公司地區', '薪資', '公司名稱', '發布時間'))

        # build the list of result pages to crawl for the search keyword
        key = '大數據'
        print("search keyword:", key)
        self.urls = []
        for i in range(1, 200):
            f_url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
                     + key + ',2,' + str(i) + '.html'
                     '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
                     '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0'
                     '&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0'
                     '&address=&line=&specialarea=00&from=&welfare=')
            self.urls.append(f_url)

    def start_requests(self):
        for url in self.urls:
            # parse() is called for each response as it arrives
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # each posting is a div.el under the #resultList container
        jobs = response.xpath('//*[@id="resultList"]/div[@class="el"]')

        for job in jobs:
            title = job.xpath('p/span/a/text()').extract_first(default='').strip()         # job title
            locality = job.xpath('span[2]/text()').extract_first()                         # company location
            salary = job.xpath('span[3]/text()').extract_first()                           # salary
            companyName = job.xpath('span[1]/a/text()').extract_first(default='').strip()  # company name
            releaseTime = job.xpath('span[4]/text()').extract_first()                      # posting date

            print(title, locality, salary, companyName, releaseTime)
            self.writer.writerow((title, locality, salary, companyName, releaseTime))

    def closed(self, reason):
        # close the CSV file once the crawl finishes
        self.f.close()


def main():
    process = CrawlerProcess({
        'USER_AGENT': ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    })

    process.crawl(MySpider)
    process.start()  # starts the whole crawl; Scrapy's log output can be ignored


if __name__ == '__main__':
    main()
```
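If the crawl yields no rows, the selectors can be checked interactively before a full run. A quick debugging sketch using Scrapy's shell (the 51job page markup may well have changed since 2019, in which case the XPath needs updating):

```
$ scrapy shell "https://search.51job.com/list/000000,000000,0000,00,9,99,大數據,2,1.html"
>>> len(response.xpath('//*[@id="resultList"]/div[@class="el"]'))  # number of postings on the page
```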
The scraped data is saved to a CSV file, crawl_51jobs.csv. Inspecting the file reveals several quirks in the raw data:

- The salary units are inconsistent (千/月, 萬/月, 萬/年, 元/天).
- The company-location format varies: some entries are a bare city, others are city-district.
- Some postings have blank fields.
Cleaning the data
Clean the data according to the quirks observed in the CSV file:
- Convert the company location from "city-district" to just the city, dropping the district: e.g. "上海-浦東" becomes "上海".
- Normalize the salary (the raw data mixes 千/月, 萬/月 and other units) to a single unit, 千元/月 (thousands of RMB per month), and split each salary range into a minimum and a maximum: e.g. "4-6千/月" becomes minimum 4, maximum 6. The unit conversions are spelled out in the sketch after this list.
- Drop rows that contain empty values (a posting's location, salary, etc. may be blank, which would get in the way of the later analysis) and rows whose location is "異地招聘" (hiring in another region).
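As a sanity check on those conversions, the arithmetic can be isolated in a small helper (a minimal sketch for illustration; `to_thousand_per_month` is a hypothetical function, not part of the original script):

```python
def to_thousand_per_month(value, unit):
    """Convert a salary figure to 千/月 (thousands of RMB per month)."""
    if unit == '萬/年':              # 1 萬/年 = 10 千/年 = 10/12 千/月
        return value * 10 / 12
    if unit == '萬/月':              # 1 萬 = 10 千
        return value * 10
    if unit == '元/天':              # assumes 21 working days per month
        return value * 21 / 1000
    return value                     # already in 千/月

assert to_thousand_per_month(6, '千/月') == 6
assert to_thousand_per_month(2, '萬/月') == 20
assert abs(to_thousand_per_month(12, '萬/年') - 10) < 1e-9
```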
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 14:47:27 2019

@author: loo
"""

import re
import csv
import numpy as np


def salaryCleaning(salary):
    """
    Normalize the salary unit to 千元/月 and split each salary
    range into a minimum and a maximum salary.
    """
    minSa, maxSa = [], []
    for sa in salary:
        if sa:
            if '-' in sa:  # ranges such as 1-2萬/月 or 10-20萬/年
                minSalary = re.findall(r'\d*\.?\d+', sa)[0]
                maxSalary = re.findall(r'\d*\.?\d+', sa)[1]
                if u'萬' in sa and u'年' in sa:    # 萬/年 -> 千/月
                    minSalary = float(minSalary) / 12 * 10
                    maxSalary = float(maxSalary) / 12 * 10
                elif u'萬' in sa and u'月' in sa:  # 萬/月 -> 千/月
                    minSalary = float(minSalary) * 10
                    maxSalary = float(maxSalary) * 10
            else:  # single values such as 20萬以上/年 or 100元/天:
                   # keep only a minimum salary, no maximum
                minSalary = re.findall(r'\d*\.?\d+', sa)[0]
                maxSalary = ""
                if u'萬' in sa and u'年' in sa:
                    minSalary = float(minSalary) / 12 * 10
                elif u'萬' in sa and u'月' in sa:
                    minSalary = float(minSalary) * 10
                elif u'元' in sa and u'天' in sa:
                    minSalary = float(minSalary) / 1000 * 21  # 21 working days per month
        else:
            minSalary = ""
            maxSalary = ""

        minSa.append(minSalary)
        maxSa.append(maxSalary)
    return minSa, maxSa


def locFormat(locality):
    """
    Reduce "city-district" to just "city".
    """
    newLocality = []
    for loc in locality:
        if '-' in loc:  # entry contains a district
            newLoc = re.findall(r'(\w*)-', loc)[0]
        else:           # entry is already just a city
            newLoc = loc
        newLocality.append(newLoc)
    return newLocality


def readFile():
    """
    Read the raw CSV file produced by the crawler.
    """
    data = []
    with open("crawl_51jobs.csv", encoding='gbk') as f:
        csv_reader = csv.reader(f)
        next(csv_reader)           # skip the header row
        for row in csv_reader:
            data.append(row)

    nd_data = np.array(data)       # as an ndarray, columns are easy to slice
    jobName = nd_data[:, 0]
    locality = nd_data[:, 1]
    salary = nd_data[:, 2]
    companyName = nd_data[:, 3]
    releaseTime = nd_data[:, 4]
    return jobName, locality, salary, companyName, releaseTime


def saveNewFile(jobName, newLocality, minSa, maxSa, companyName, releaseTime):
    """
    Write the cleaned data to a new CSV file.
    """
    new_f = open('cleaned_51jobs.csv', 'wt', newline='', encoding='GBK', errors='ignore')
    writer = csv.writer(new_f)
    writer.writerow(('職位', '公司城市', '最低薪資(千/月)', '最高薪資(千/月)', '公司名稱', '發布時間'))

    for num in range(len(jobName)):
        # skip rows with empty fields and rows whose location is 異地招聘
        if newLocality[num] and minSa[num] and maxSa[num] and companyName[num] \
                and newLocality[num] != "異地招聘":
            writer.writerow((jobName[num], newLocality[num], minSa[num], maxSa[num],
                             companyName[num], releaseTime[num]))
    new_f.close()


def main():
    # read the raw data
    jobName, locality, salary, companyName, releaseTime = readFile()

    # clean the location and salary columns
    newLocality = locFormat(locality)
    minSa, maxSa = salaryCleaning(salary)

    # save the cleaned data to a new CSV file
    saveNewFile(jobName, newLocality, minSa, maxSa, companyName, releaseTime)


if __name__ == '__main__':
    main()
```
Visualizing and analyzing the data
Three views are generated from the cleaned data:
- the 20 cities with the most postings and the 20 cities with the highest average salary (bar charts);
- the job-title landscape of big data positions (word cloud);
- the city distribution of big data positions (word cloud).
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 1 20:15:56 2019

@author: loo
"""

import matplotlib.pyplot as plt
import csv
import numpy as np
import re
from wordcloud import WordCloud, STOPWORDS


def readFile():
    """
    Read the cleaned CSV file.
    """
    data = []
    with open("cleaned_51jobs.csv", encoding='gbk') as f:
        csv_reader = csv.reader(f)
        next(csv_reader)           # skip the header row
        for row in csv_reader:
            data.append(row)

    nd_data = np.array(data)
    jobName = nd_data[:, 0]
    locality = nd_data[:, 1]
    minSalary = nd_data[:, 2]
    maxSalary = nd_data[:, 3]
    return data, jobName, locality, minSalary, maxSalary


def salary_locality(data):
    """
    Compute the number of postings and the average salary per city,
    then plot the top 20 of each.
    """
    city_num = dict()

    for job in data:
        loc, minSa, maxSa = job[1], float(job[2]), float(job[3])
        if loc not in city_num:
            avg_salary = (minSa + maxSa) / 2  # midpoint of the salary range
            city_num[loc] = (1, avg_salary)
        else:
            num = city_num[loc][0]
            # incremental update of the running mean
            avg_salary = ((minSa + maxSa) / 2 + num * city_num[loc][1]) / (num + 1)
            city_num[loc] = (num + 1, avg_salary)

    # sort by number of postings, descending
    title_sorted = dict(sorted(city_num.items(), key=lambda x: x[1][0], reverse=True))
    # sort by average salary, descending
    salary_sorted = dict(sorted(city_num.items(), key=lambda x: x[1][1], reverse=True))

    allCity1, allCity2, allNum, allAvg = [], [], [], []
    # top 20 cities by number of postings
    for city in list(title_sorted)[:20]:
        allCity1.append(city)
        allNum.append(title_sorted[city][0])
    # top 20 cities by average salary
    for city in list(salary_sorted)[:20]:
        allCity2.append(city)
        allAvg.append(salary_sorted[city][1])

    # make Chinese labels render correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    x = np.arange(20)   # bar positions on the x axis
    plt.figure(figsize=(13, 11))
    bar_width = 0.8

    plt.subplot(211)
    plt.title('51job——大數據職位數前20名城市')
    plt.xlabel(u"城市")
    plt.ylabel(u"職位數")
    plt.xticks(x, allCity1)
    plt.bar(x, allNum, bar_width, color='salmon')

    plt.subplot(212)
    plt.title('51job——大數據職位平均薪資的前20名城市')
    plt.xlabel(u"城市")
    plt.ylabel(u"平均薪資(千元/月)")
    plt.xticks(x, allCity2)
    plt.bar(x, allAvg, bar_width, color='orchid')

    # plt.savefig('city.jpg', dpi=500)  # save at a fixed resolution
    plt.show()


def jobTitle(jobName):
    """
    Word cloud of the job titles.
    """
    word = "".join(jobName)

    font = 'simkai.ttf'  # a font that can render Chinese

    # drop ASCII letters, digits and punctuation; keep the Chinese text
    resultword = re.sub(r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\。\@\#\\\&\*\%\-]",
                        " ", word)
    wl_space_split = resultword

    # stop words: generic recruiting phrases and city names
    sw = set(STOPWORDS)
    sw.update(["高提成", "底薪", "五險", "雙休", "五險一金", "社保", "上海", "廣州",
               "無責底薪", "月薪", "急聘", "急招", "資深", "包吃住", "周末雙休", "代招",
               "高薪", "高底薪", "校招", "月均", "可實習", "年薪", "北京", "經理",
               "包住", "應屆生", "南京", "專員", "提成", "方向"])

    my_wordcloud = WordCloud(font_path=font, stopwords=sw, scale=4, background_color='white',
                             max_words=100, max_font_size=60, random_state=20).generate(wl_space_split)
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()

    # my_wordcloud.to_file('title.jpg')  # save the image


def localityWordCloud(locality):
    """
    Word cloud of the cities.
    """
    font = 'simkai.ttf'
    locality = " ".join(locality)

    my_wordcloud = WordCloud(font_path=font, scale=4, background_color='white',
                             max_words=100, max_font_size=60, random_state=20).generate(locality)
    plt.imshow(my_wordcloud)
    plt.axis("off")
    plt.show()

    # my_wordcloud.to_file('place.jpg')  # save the image


def main():
    # load the cleaned data
    data, jobName, locality, minSalary, maxSalary = readFile()
    # analyze and plot
    salary_locality(data)
    jobTitle(jobName)
    localityWordCloud(locality)


if __name__ == '__main__':
    main()
```
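salary_locality keeps a running per-city mean instead of storing every salary. A standalone check that the incremental update matches the batch mean (illustration only, not part of the original script):

```python
salaries = [5.0, 7.5, 10.0]   # midpoints of a few salary ranges, in 千/月

# batch mean
batch = sum(salaries) / len(salaries)

# incremental update, as in salary_locality
avg, n = 0.0, 0
for x in salaries:
    avg = (x + n * avg) / (n + 1)
    n += 1

assert abs(avg - batch) < 1e-9
```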
[Figures: the two bar charts and the two word clouds produced by the script above.]
Conclusions
1) The three cities with the most big data postings: Shanghai (上海), Guangzhou (廣州), Shenzhen (深圳).
2) The three cities with the highest average salary: Fujian (福建), Chuzhou (滁州), Sanya (三亞).
3) Cities with high demand for big data positions do not necessarily pay well. On the contrary, in this data the cities with the most postings have lower average salaries, while cities with few postings show higher averages.
4) The job title in greatest demand for big data positions: development engineer (開發工程師).