前程無憂數據采集與分析

本文轉載自查看原文 2020-04-22 21:11 930

一.網絡爬蟲設計方案：

1.主題網絡爬蟲名稱：51job 招聘網站信息數據采集

2.主題網絡爬蟲爬取的內容：采集python崗位薪資，職位，城市，學歷等信息

3.主題式網絡爬蟲設計方案概述：進入網站搜索 python 並勾選對應學歷，確定網址 url 後逐頁獲取每一頁的 html 代碼並解析出對應數據；解析期間同時進行數據清洗，將不規範的數據從源頭去除，然後保存至字典，再利用 xlsxwriter 模塊存入 excel 表格；最後進行數據可視化處理，繪製各城市薪資佔比、各學歷崗位熱度、各學歷薪資分布等信息圖。

二.主題頁面的結構特征分析：

1.主題頁面結構特徵分析：每一條崗位信息都位於 class='el' 的 div 標籤中，我們可先用 xpath 取出每一個 div，再對每一條崗位單獨進行解析，這樣可以避免因各條崗位結構不同（例如個別崗位沒有給出薪資）而造成的數據錯位、不精準的現象。

2.頁面解析

三：網絡爬蟲程序設計：

1.數據采集

import requests
from lxml import etree
from xlsxwriter import Workbook

# The search-result URLs differ only in the page number and in the
# education-level code carried by the ``degreefrom`` query parameter, so they
# are all generated from one shared template.  The number of result pages
# crawled per education level is hard-coded below.
_SEARCH_URL = (
    "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{page}.html"
    "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom={degree}"
    "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
)


def _build_urls(degree, last_page):
    """Return the search URLs for pages 1..last_page of one education level.

    ``degree`` is the two-digit ``degreefrom`` code placed in the query
    string; ``last_page`` is the last page number to include (inclusive).
    """
    return [_SEARCH_URL.format(page=page, degree=degree)
            for page in range(1, last_page + 1)]


# Doctorate, code 06: pages 1-4
urls_06 = _build_urls("06", 4)

# Master's degree, code 05: pages 1-58
urls_05 = _build_urls("05", 58)

# Bachelor's degree, code 04: pages 1-406
urls_04 = _build_urls("04", 406)

# Junior college, code 03: pages 1-102
urls_03 = _build_urls("03", 102)


def _parse_jobs(page_html):
    """Extract the job postings contained in one search-result page.

    Returns a list of dicts keyed by '職位' (job title), '公司名' (company),
    '地址' (work address) and '薪資' (salary).  A posting that lacks any of
    the four fields is left out, so every returned dict is complete; this is
    the in-crawl data cleaning step (postings without a salary are not wanted).
    """
    tree = etree.HTML(page_html)
    # Guard: skip the page if the parser produced no document.
    if tree is None:
        return []

    jobs = []
    # Each posting lives in its own <div class="el">.  Parsing row by row keeps
    # the fields of one posting together even when another posting omits one.
    for row in tree.xpath("//div[@class='dw_table']//div[@class='el']"):
        title = row.xpath('p/span/a/text()')
        company = row.xpath('span[@class="t2"]/a/text()')
        address = row.xpath('span[@class="t3"]/text()')
        salary = row.xpath('span[@class="t4"]/text()')

        # Incomplete rows are skipped instead of indexing into an empty list.
        if not (title and company and address and salary):
            continue

        jobs.append({
            # Remove the padding spaces around/inside the title text.
            '職位': title[0].replace(' ', ''),
            '公司名': company[0],
            '地址': address[0],
            '薪資': salary[0],
        })
    return jobs


def _save_jobs(jobs, path):
    """Write the job dicts to an .xlsx file at path, one posting per row.

    Row 0 holds the column headers; data rows start at row 1.
    """
    columns = ["職位", "公司名", "地址", "薪資"]

    # The context manager closes (and thereby saves) the workbook even if a
    # write fails part-way through.
    with Workbook(path) as wb:
        ws = wb.add_worksheet("New Sheet")
        for col, header in enumerate(columns):
            ws.write(0, col, header)
        for row, job in enumerate(jobs, start=1):
            for col, key in enumerate(columns):
                ws.write(row, col, job[key])


def get_data(tablename):
    """Crawl every result page of one education level and save it to Excel.

    tablename is the education level: '專科', '本科', '碩士' or '博士'.  It
    selects the URL list to crawl and names the output file
    ``./excels/<tablename>.xlsx``.  An unrecognised value crawls nothing and
    produces a workbook that contains only the header row.

    Network errors raised by ``requests`` propagate to the caller.
    """
    import os

    urls_by_level = {
        '專科': urls_03,
        '本科': urls_04,
        '碩士': urls_05,
        '博士': urls_06,
    }
    urls = urls_by_level.get(tablename, [])

    # infos accumulates the postings of all pages.
    infos = []
    for url in urls:
        # Progress output: show which page is being fetched.
        print(url)

        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, timeout=30)

        # The pages are decoded as GBK; undecodable bytes are replaced rather
        # than aborting a crawl that may already have collected many pages.
        page_html = response.content.decode("gbk", errors="replace")
        infos.extend(_parse_jobs(page_html))

    # Make sure the output directory exists before the workbook is saved.
    os.makedirs("./excels", exist_ok=True)
    _save_jobs(infos, "./excels/%s.xlsx" % tablename)


# The four education levels to crawl.
educ = ['專科', '本科', '碩士', '博士']

# Hand the levels to get_data one after another.
for level in educ:
    get_data(level)

保存成功：

2.數據清洗和處理

為減少代碼冗余，我們在采集過程中就引入了數據的處理和清洗

3.數據分析與可視化

(1)地區熱度詞雲圖

import io
import sys

import jieba
from matplotlib import pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud
from xlrd import open_workbook

# Use a Chinese-capable font for matplotlib text.  The setting does not depend
# on the workbook being read, so it is applied once up front.
mpl.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams['font.serif'] = ['KaiTi']

# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Collect the first two characters of every address (presumably the city
# name) from the four workbooks produced by the crawler.
prefixes = []
for filename in ['專科', '本科', '碩士', '博士']:
    # NOTE(review): xlrd 2.0 and later read only .xls files; opening these
    # .xlsx workbooks needs xlrd < 2.0 - confirm the installed version.
    workbook = open_workbook(r'./excels/{}.xlsx'.format(filename))

    # The crawler writes a single sheet, so read the first one.
    sheet = workbook.sheet_by_index(0)

    # Column index 2 is the address column.  [1:] skips the header cell so the
    # column title itself is not counted as a place name.
    content = sheet.col_values(2)[1:]

    prefixes.extend(address[:2] for address in content)

# One join instead of repeated string concatenation.
f = "".join(prefixes)

# Segment the text with jieba and join the tokens with spaces; the original
# author notes that WordCloud cannot build a correct Chinese cloud otherwise.
cut_text = " ".join(jieba.cut(f))

wordcloud = WordCloud(
    # A font with Chinese glyphs is required, otherwise the words render as
    # empty boxes.  This is the usual Windows font location; any other
    # Chinese-capable font file can be substituted.
    font_path="C:/Windows/Fonts/simfang.ttf",
    # Background colour and canvas size.
    background_color="white", width=2000, height=1200).generate(cut_text)

plt.imshow(wordcloud, interpolation="bilinear")

# Hide the axes; only the cloud image is wanted.
plt.axis("off")

plt.savefig('地區熱度詞雲圖.png')

效果：

(2) 崗位數量折線圖示

import io
import sys

from matplotlib import pyplot as plt
import matplotlib as mpl
from xlrd import open_workbook

# The education levels, in plotting order.  The same list drives both the
# workbooks that are read and the x-axis labels, so the two cannot drift apart.
input_values = ['專科', '本科', '碩士', '博士']

# Use a Chinese-capable font for the labels; set once, not per workbook.
mpl.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams['font.serif'] = ['KaiTi']
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# Number of postings per education level, in the same order as input_values.
nums_list = []
for filename in input_values:
    # NOTE(review): xlrd 2.0 and later read only .xls files; opening these
    # .xlsx workbooks needs xlrd < 2.0 - confirm the installed version.
    workbook = open_workbook(r'./excels/{}.xlsx'.format(filename))
    sheet = workbook.sheet_by_index(0)  # the first sheet of the workbook
    # Column index 0 without its header cell: one entry per posting.
    content = sheet.col_values(0)[1:]
    nums_list.append(len(content))

squares = nums_list


# Draw the line chart.
plt.plot(input_values, squares, linewidth=2,)  # x: education levels, y: posting counts
plt.title("python崗位數量分布", fontsize=24)  # chart title and its font size
plt.xlabel("學歷", fontsize=14)  # x-axis label and its font size
plt.ylabel("崗位數量", fontsize=14)  # y-axis label and its font size
plt.tick_params(axis='both', labelsize=14)  # tick label size on both axes
plt.savefig("./崗位數量折線圖示.png")  # save the figure to disk

（3）各學歷對應職位占比

import io
import sys

from matplotlib import pyplot as plt
import matplotlib as mpl
from xlrd import open_workbook

import os
from collections import Counter

filename = '專科'     # change the education level and re-run to get each of the four charts

# Use a Chinese-capable font for the labels.
mpl.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams['font.serif'] = ['KaiTi']

# Re-wrap stdout as UTF-8 so the job titles printed below are encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# NOTE(review): xlrd 2.0 and later read only .xls files; opening this .xlsx
# workbook needs xlrd < 2.0 - confirm the installed version.
workbook = open_workbook(r'./excels/{}.xlsx'.format(filename))

sheet = workbook.sheet_by_index(0)  # the first sheet of the workbook

# Column index 0 without its header cell.  The crawler stores the job title in
# this column.
content = sheet.col_values(0)[1:]

# Drop the line breaks left inside the title cells.
new_con = [title.replace('\r', '').replace('\n', '') for title in content]

# Count how often each job title occurs.
temp = Counter(new_con)

# The six most frequent titles as (title, count) pairs, most frequent first;
# titles with equal counts keep the order in which they were first seen.
new_title = temp.most_common(6)

nums = [count for _, count in new_title]
input_values = [title for title, _ in new_title]

# Share of each title among the titles shown (not among all postings).
sum_nums = sum(nums)
squares = [x / sum_nums for x in nums]

print(input_values)

print(squares)


# Equal aspect ratio keeps the pie circular.
plt.axes(aspect=1)
plt.pie(x=squares, labels=input_values, autopct='%3.1f %%')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./崗位熱度圖示", exist_ok=True)
plt.savefig("./崗位熱度圖示/{}崗位熱度餅狀圖示.png".format(filename))

（4）各學歷薪資狀況

import io
import sys

from matplotlib import pyplot as plt
import matplotlib as mpl
from xlrd import open_workbook

import os
from collections import Counter

filename = '專科'     # change the education level and re-run to get each of the four charts

# Use a Chinese-capable font for the labels.
mpl.rcParams['font.sans-serif'] = ['KaiTi']
mpl.rcParams['font.serif'] = ['KaiTi']
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# NOTE(review): xlrd 2.0 and later read only .xls files; opening this .xlsx
# workbook needs xlrd < 2.0 - confirm the installed version.
workbook = open_workbook(r'./excels/{}.xlsx'.format(filename))
sheet = workbook.sheet_by_index(0)  # the first sheet of the workbook

# Column index 3 is the salary column.  The header cell is included here but
# is dropped by the '月' filter below because it does not contain that
# character.
content = sheet.col_values(3)[0:]

# Keep only the cells that mention '月' (month) and cut off their last two
# characters (presumably the "/月" unit suffix - confirm against the data).
# Salaries quoted without '月' are left out of the chart.
moneys = [salary[:-2] for salary in content if '月' in salary]

# money_num maps each salary range to the number of postings offering it.
money_num = Counter(moneys)

# (salary, count) pairs sorted by count, most frequent first; salaries with
# equal counts keep the order in which they were first seen.
new_money_nums = money_num.most_common()

# The eight most common salary ranges and their counts.
x = [salary for salary, _ in new_money_nums[0:8]]
y = [count for _, count in new_money_nums[0:8]]


plt.bar(x, y, align='center')

plt.title('python')
plt.ylabel('人數')
plt.xlabel('薪資')


# savefig does not create missing directories, so make sure the target exists.
os.makedirs("./薪資范圍圖示", exist_ok=True)
plt.savefig("./薪資范圍圖示/{}薪資分布柱狀圖.png".format(filename))

（5）代碼匯總

四：結論：

從分析結果可知，要想得到更好的生活，就要好好學習哈哈哈；更要明白難的知識才會更有含金量，如果不刻苦學習，將來可能只是初級運維師，越努力越幸運。

這次項目也讓我感受到了 python 的魅力，成就感頗高、收穫頗豐，讓我對後期課程更加感興趣。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 前程無憂——數據分析崗位爬取前程無憂爬蟲源碼及分析（一）前程無憂數據爬取用戶行為分析之離線數據采集 51job 數據采集和分析項目中的數據采集分析需求前程無憂崗位數據爬取+Tableau可視化分析使用Python爬取、清洗並分析前程無憂的大數據職位數據采集：埋點、采集、存儲及分析 Python網絡爬蟲——前程無憂網數據爬取及可視化分析