I. Background of the Topic
Why was this topic chosen, and what are the expected goals of the data analysis?
As graduation approaches, many students still have little idea of the salaries offered for computer-related positions. This project aims to show students the top and average salaries in the computer industry, so that every student can form a clearer picture of the industry's prospects.
II. Web Crawler Design Plan
1. Name: job-board recruitment crawler
2. Data to crawl and features to analyze
The data crawled this time includes: job title, company, work location, salary, posting date, and the URL of each job listing.
3. Crawler design plan
1. Select the target elements according to the page layout
2. Extract the target content
3. Save the content
4. Technical challenges: mainly saving the results to a file, extracting the content, and the site's anti-crawling mechanism (a minimal request sketch follows below)
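As a first look at the anti-crawling point, the sketch below shows the basic countermeasures used later in Section IV: a browser-like User-Agent header and a fixed delay between requests. The URL is the first results page of the search used in the full program, with the query parameters omitted for brevity; it is included only for illustration.

import time
import requests

# A minimal sketch of the anti-crawling countermeasures used in Section IV:
# send a browser-like User-Agent and pause between successive page requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'
}
# First page of the 51job search used later (page number 1 substituted into the URL pattern,
# query parameters omitted for brevity)
demo_url = ("https://search.51job.com/list/000000,000000,0000,00,9,99,"
            "%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA,2,1.html")

response = requests.get(demo_url, headers=headers, timeout=10)
response.raise_for_status()   # stop early if the request was blocked or failed
print(response.status_code)
time.sleep(5)                 # wait before requesting the next page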
III. Structural Analysis of the Target Pages
The pages are parsed with BeautifulSoup, and the required data is taken from the JavaScript block embedded in the page:
soup.find_all('script')[7]
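To make the extraction step concrete, here is a small example: the sample string below is a made-up stand-in for the JSON-like payload found inside that script tag, and the regular expressions are the ones used by the full crawler in Section IV.

import re

# Hypothetical stand-in for the JSON-like text inside soup.find_all('script')[7];
# the real payload contains one such record per job posting.
sample_script = '{"job_name":"Python開發工程師","company_name":"某科技公司","providesalary_text":"1-1.5萬/月"}'

# The same patterns used by the full crawler in Section IV
job_names = re.findall(r'"job_name":"(.*?)"', sample_script)
salaries = re.findall(r'"providesalary_text":"(.*?)"', sample_script)
print(job_names, salaries)   # ['Python開發工程師'] ['1-1.5萬/月']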
IV. Web Crawler Program Design
1. Data crawling and collection
import time
import requests
from bs4 import BeautifulSoup
import os
import csv
import re

def analysis(item, results):
    # Extract the target field from the script text with a regular expression
    pattern = re.compile(item, re.I | re.M)
    result_list = pattern.findall(results)
    return result_list

def precess(item):
    # Remove commas and backslashes so they do not break the csv format
    return item.replace(',', ' ').replace('\\', '')

# Build the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'
}
url_pattern = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="

if not os.path.exists("intro_job.csv"):
    # Create the csv file that stores the data
    file = open('intro_job.csv', "w", encoding="utf-8-sig", newline='')
    csv_head = csv.writer(file)
    # Header row
    header = ['job', 'company', 'place', 'salary', 'date', 'detail_url']
    csv_head.writerow(header)
    file.close()
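As an aside on the file-saving step, rows could also be appended with csv.writer instead of joining the fields by hand; this is only a sketch of an alternative (the record values below are hypothetical), not the method used in this report.

import csv

# Sketch of an alternative way to append one record: csv.writer escapes commas and
# quotes automatically, so the comma replacement done in precess() would not be needed.
row = ['Python開發工程師', '某科技公司', '上海-浦東', '1-1.5萬/月', '06-21',
       'https://jobs.51job.com/example.html']   # hypothetical record for illustration
with open('intro_job.csv', 'a+', encoding='utf-8-sig', newline='') as f:
    csv.writer(f).writerow(row)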
2. Data cleaning and processing
1. Data cleaning
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import math
import re

# Read the data
df = pd.read_csv('intro_job.csv', encoding='utf-8-sig', usecols=["job", "company", "place", "salary", "date"])

# Save each field into an array
job_array = df['job'].values
company_array = df['company'].values
place_array = df['place'].values
salary_array = df['salary'].values
date_array = df['date'].values

# Drop records whose salary field is missing, and drop the corresponding entries
# in the other arrays as well (a missing salary is read back as a float NaN)
bool_array = np.ones_like(salary_array, dtype=bool)
for i in range(len(salary_array)):
    if isinstance(salary_array[i], float):
        bool_array[i] = False
print(len(job_array))
print(sum(bool_array))
job_array = job_array[bool_array]
print(len(job_array))
company_array = company_array[bool_array]
place_array = place_array[bool_array]
salary_array = salary_array[bool_array]
date_array = date_array[bool_array]
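The same filtering can be expressed directly in pandas; the sketch below drops rows whose salary is missing (read back as NaN, which is what the float check above detects) and is equivalent in effect to the boolean mask.

# Alternative sketch: drop rows with a missing salary field directly in pandas
df_clean = df.dropna(subset=['salary']).reset_index(drop=True)
print(len(df), len(df_clean))   # total rows vs. rows with a valid salary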
2. Data processing
# Reduce the work location to the city, e.g. "上海-浦東" => "上海"
place_array_city = []
for place in place_array:
    if re.findall('-', place):
        place_array_city.append(place[:place.find('-')])
    else:
        place_array_city.append(place)

def calc_money(salary_tmp):
    # Strip the unit character and average the two ends of a salary range
    if re.findall('千', salary_tmp):
        salary_tmp = salary_tmp[:salary_tmp.find('千')]
    elif re.findall('萬', salary_tmp):
        salary_tmp = salary_tmp[:salary_tmp.find('萬')]
    elif re.findall('元', salary_tmp):
        salary_tmp = salary_tmp[:salary_tmp.find('元')]
    if re.findall('-', salary_tmp):
        salary_tmp = salary_tmp.split('-')
        return (float(salary_tmp[0]) + float(salary_tmp[1])) / 2
    else:
        return float(salary_tmp)

def calc_total(salary_tmp):
    # Convert the 千/萬 unit into the actual amount
    money = calc_money(salary_tmp)
    if salary_tmp[-1] == '千':
        money *= 1000
    elif salary_tmp[-1] == '萬':
        money *= 10000
    return money

def calc_mean(salary):
    # Convert a salary string such as "1-1.5萬/月" into an average monthly salary
    if re.findall('小時', salary):
        salary_tmp = salary[:-3]
    else:
        salary_tmp = salary[:-2]
    money = calc_total(salary_tmp)
    if re.findall('年', salary):
        money /= 12.0
    elif re.findall('天', salary):
        money *= 30
    elif re.findall('小時', salary):
        money = money * 8 * 20
    return money

# Compute the average monthly salary for every record
salary_array_mean = []
for salary in salary_array:
    money = calc_mean(salary)
    salary_array_mean.append(money)

# Number of postings per city, e.g. "上海": 1300 means Shanghai has 1300 related postings
city_dict = {}
# Total monthly salary per city, e.g. "上海": 10000; dividing by the posting count gives that city's average monthly salary
salary_dict = {}
for i in range(len(place_array_city)):
    if city_dict.get(place_array_city[i]):
        city_dict[place_array_city[i]] += 1
        salary_dict[place_array_city[i]] += salary_array_mean[i]
    else:
        city_dict[place_array_city[i]] = 1
        salary_dict[place_array_city[i]] = salary_array_mean[i]
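A quick sanity check of the salary-parsing helpers above, using a few salary formats that the helpers are written to handle; the expected values follow directly from the rules in calc_money, calc_total and calc_mean.

# Expected average monthly salaries for typical salary strings
print(calc_mean('1-1.5萬/月'))   # (1 + 1.5) / 2 * 10000 = 12500.0
print(calc_mean('20-30萬/年'))   # 25 * 10000 / 12 ≈ 20833.33
print(calc_mean('150元/天'))     # 150 * 30 = 4500.0
print(calc_mean('25元/小時'))    # 25 * 8 * 20 = 4000.0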
4. Data visualization and analysis
# National average monthly salary for computer/software postings
mean_salary = sum(salary_array_mean) / len(salary_array_mean)
# Sort cities by posting count
d_order = sorted(city_dict.items(), key=lambda x: x[1], reverse=True)
# Average monthly salary of the top 20 cities by posting count
mean_top_20 = []
# Names of the top 20 cities by posting count
city_top_20 = []
for i in d_order[:20]:
    mean_top_20.append(salary_dict[i[0]] / i[1])
    city_top_20.append(i[0])

# Set the Chinese font before plotting so that the labels display correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(city_top_20, mean_top_20, marker='o', label='各市平均月薪')
plt.plot([mean_salary] * 20, '--', label='全國平均月薪')
plt.xticks(rotation=30.)   # rotate the tick labels after plotting so the rotation actually applies
plt.legend()
plt.show()
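As an optional companion chart that is not part of the original analysis, the posting counts behind the same top-20 cities (already stored in city_dict) could be plotted as a bar chart; this is only a sketch.

# Sketch: bar chart of posting counts for the same top-20 cities
count_top_20 = [city_dict[city] for city in city_top_20]
plt.figure(figsize=(10, 5))
plt.bar(city_top_20, count_top_20)
plt.xticks(rotation=30)
plt.xlabel('城市')
plt.ylabel('崗位數量')
plt.title('崗位數量前20的城市')
plt.show()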
plt.scatter(place_array, salary_array, s=10, marker="o")
plt.xlabel("地點")
plt.ylabel("薪資")
plt.title("薪資分布-散點圖")
plt.show()
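The scatter above plots the raw salary strings, which matplotlib treats as categories on the y-axis. A sketch of an alternative using the numeric monthly means computed earlier (salary_array_mean against place_array_city) gives a more readable distribution; it is an optional variant, not the original chart.

# Sketch: scatter of numeric average monthly salary per posting, grouped by city
plt.figure(figsize=(12, 5))
plt.scatter(place_array_city, salary_array_mean, s=10, marker='o')
plt.xticks(rotation=90, fontsize=6)
plt.xlabel('地點')
plt.ylabel('平均月薪')
plt.title('平均月薪分布-散點圖')
plt.show()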
5. Complete program code, combining all of the parts above
import time
import requests
from bs4 import BeautifulSoup
import os
import csv
import re

def analysis(item, results):
    # Extract the target field from the script text with a regular expression
    pattern = re.compile(item, re.I | re.M)
    result_list = pattern.findall(results)
    return result_list

def precess(item):
    # Remove commas and backslashes so they do not break the csv format
    return item.replace(',', ' ').replace('\\', '')

# Build the request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0'
}
url_pattern = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA,2,{}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="

if not os.path.exists("intro_job.csv"):
    # Create the csv file that stores the data
    file = open('intro_job.csv', "w", encoding="utf-8-sig", newline='')
    csv_head = csv.writer(file)
    # Header row
    header = ['job', 'company', 'place', 'salary', 'date', 'detail_url']
    csv_head.writerow(header)
    file.close()

for i in range(1, 2001):
    # Add a delay to avoid triggering the anti-crawling mechanism
    time.sleep(5)
    url = url_pattern.format(i)
    response = requests.get(url=url, headers=headers)
    # Declare the page encoding; adjust according to the actual response
    response.encoding = 'gbk'
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    results = str(soup.find_all('script')[7])
    job_names = analysis(r'"job_name":"(.*?)"', results)
    company_names = analysis(r'"company_name":"(.*?)"', results)
    workarea_texts = analysis(r'"workarea_text":"(.*?)"', results)
    providesalary_texts = analysis(r'"providesalary_text":"(.*?)"', results)
    updatedates = analysis(r'"updatedate":"(.*?)"', results)
    job_hrefs = analysis(r'"job_href":"(.*?)"', results)
    for j in range(len(job_names)):
        with open('intro_job.csv', 'a+', encoding='utf-8-sig') as f:
            f.write(precess(job_names[j]) + ',' + precess(company_names[j]) + ',' +
                    precess(workarea_texts[j]) + ',' + precess(providesalary_texts[j]) + ',' +
                    precess(updatedates[j]) + ',' + precess(job_hrefs[j]) + '\n')

import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import math
import re

# Read the data
df = pd.read_csv('intro_job.csv', encoding='utf-8-sig', usecols=["job", "company", "place", "salary", "date"])

# Save each field into an array
job_array = df['job'].values
company_array = df['company'].values
place_array = df['place'].values
salary_array = df['salary'].values
date_array = df['date'].values

# Drop records whose salary field is missing, and drop the corresponding entries
# in the other arrays as well (a missing salary is read back as a float NaN)
bool_array = np.ones_like(salary_array, dtype=bool)
for i in range(len(salary_array)):
    if isinstance(salary_array[i], float):
        bool_array[i] = False
print(len(job_array))
print(sum(bool_array))
job_array = job_array[bool_array]
print(len(job_array))
company_array = company_array[bool_array]
place_array = place_array[bool_array]
salary_array = salary_array[bool_array]
date_array = date_array[bool_array]

# Reduce the work location to the city, e.g. "上海-浦東" => "上海"
place_array_city = []
for place in place_array:
    if re.findall('-', place):
        place_array_city.append(place[:place.find('-')])
    else:
        place_array_city.append(place)

def calc_money(salary_tmp):
    # Strip the unit character and average the two ends of a salary range
    if re.findall('千', salary_tmp):
        salary_tmp = salary_tmp[:salary_tmp.find('千')]
    elif re.findall('萬', salary_tmp):
        salary_tmp = salary_tmp[:salary_tmp.find('萬')]
    elif re.findall('元', salary_tmp):
        salary_tmp = salary_tmp[:salary_tmp.find('元')]
    if re.findall('-', salary_tmp):
        salary_tmp = salary_tmp.split('-')
        return (float(salary_tmp[0]) + float(salary_tmp[1])) / 2
    else:
        return float(salary_tmp)

def calc_total(salary_tmp):
    # Convert the 千/萬 unit into the actual amount
    money = calc_money(salary_tmp)
    if salary_tmp[-1] == '千':
        money *= 1000
    elif salary_tmp[-1] == '萬':
        money *= 10000
    return money

def calc_mean(salary):
    # Convert a salary string such as "1-1.5萬/月" into an average monthly salary
    if re.findall('小時', salary):
        salary_tmp = salary[:-3]
    else:
        salary_tmp = salary[:-2]
    money = calc_total(salary_tmp)
    if re.findall('年', salary):
        money /= 12.0
    elif re.findall('天', salary):
        money *= 30
    elif re.findall('小時', salary):
        money = money * 8 * 20
    return money

# Compute the average monthly salary for every record
salary_array_mean = []
for salary in salary_array:
    money = calc_mean(salary)
    salary_array_mean.append(money)

# Number of postings per city, e.g. "上海": 1300 means Shanghai has 1300 related postings
city_dict = {}
# Total monthly salary per city; dividing by the posting count gives that city's average monthly salary
salary_dict = {}
for i in range(len(place_array_city)):
    if city_dict.get(place_array_city[i]):
        city_dict[place_array_city[i]] += 1
        salary_dict[place_array_city[i]] += salary_array_mean[i]
    else:
        city_dict[place_array_city[i]] = 1
        salary_dict[place_array_city[i]] = salary_array_mean[i]

# National average monthly salary for computer/software postings
mean_salary = sum(salary_array_mean) / len(salary_array_mean)
# Sort cities by posting count
d_order = sorted(city_dict.items(), key=lambda x: x[1], reverse=True)
# Average monthly salary of the top 20 cities by posting count
mean_top_20 = []
# Names of the top 20 cities by posting count
city_top_20 = []
for i in d_order[:20]:
    mean_top_20.append(salary_dict[i[0]] / i[1])
    city_top_20.append(i[0])

# Set the Chinese font before plotting so that the labels display correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.plot(city_top_20, mean_top_20, marker='o', label='各市平均月薪')
plt.plot([mean_salary] * 20, '--', label='全國平均月薪')
plt.xticks(rotation=30.)   # rotate the tick labels after plotting so the rotation actually applies
plt.legend()
plt.show()

plt.scatter(place_array, salary_array, s=10, marker="o")
plt.xlabel("地點")
plt.ylabel("薪資")
plt.title("薪資分布-散點圖")
plt.show()
V. Summary
The crawler and the data analysis met the initial expectations, but the contrasts in the results are not very pronounced and the analysis is not complete. The scatter plot does show that the salary ceiling for computer-related positions is very high.
Completing this project has been rewarding and exposed many shortcomings: I am still not proficient at analyzing and integrating data, but my understanding of Python has improved.