python 招聘數據分析

本文轉載自查看原文 2020-07-09 01:18 550 數據分析

導入包

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

讀文件

df=pd.read_csv(r'C:\Users\MSI\Desktop\1.csv')

查看數據

df.head()

查看基本信息

df.info()

一共有九個字段，22739條數據，數據全為字符串，不存在數據為空的情況，因此不需要進行對缺少數據的處理

對重復數據進行處理，刪除職位和公司重復值

df.drop_duplicates(['PositionName','CompanyName'],keep='first', inplace=True)

查看處理后的信息

df.info()

剩余21851條記錄

查看薪資的分布的頻率，發現面議有較大的比重

df['Salary'].str[0:].value_counts(normalize = True)

自定義函數drops，刪除薪資中的面議

def drops(col, tag):
    """Drop, in place, every row of the module-level ``df`` whose ``col`` contains ``tag``.

    ``tag`` is handed to ``Series.str.contains`` unchanged, so it keeps that
    method's default pattern handling.

    Args:
        col: Name of a string column of ``df``.
        tag: Pattern to look for in each cell of ``col``.
    """
    # na=False: a missing cell counts as "does not contain" instead of putting
    # NaN into the mask, which would make the boolean indexing below raise.
    matches = df[col].str.contains(tag, na=False)
    df.drop(df[matches].index, inplace=True)
drops('Salary', '面議')

自定義函數cutWord求平均薪資

def cutWord(word,method):
    """Return one bound of a salary string such as ``"10-20萬"``.

    The text before the first ``-`` is the lower bound. The text after it,
    minus the final character (presumably a unit suffix -- confirm against
    the data), is the upper bound. A string without ``-`` is treated as a
    single value that serves as both bounds; previously this case raised
    UnboundLocalError.

    Args:
        word: Raw salary string.
        method: ``'bottom'`` for the lower bound; any other value returns
            the upper bound.

    Returns:
        The selected bound as a string (not yet converted to a number).
    """
    position = word.find("-")
    length = len(word)
    if position != -1:
        bottomSalary = word[:position]
        topSalary = word[position + 1:length - 1]
    else:
        # No range separator: strip the trailing character the same way the
        # upper bound is stripped and use the value for both bounds.
        bottomSalary = topSalary = word[:length - 1]
    return bottomSalary if method == 'bottom' else topSalary

# Split every Salary string into its upper and lower bound (still strings).
df['topSalary']=df.Salary.apply(cutWord,method='top')
df['bottomSalary']=df.Salary.apply(cutWord,method='bottom')

# Convert both bounds to integers so they can be averaged.
df.topSalary=df.topSalary.astype("int")
df.bottomSalary=df.bottomSalary.astype("int")

# Column arithmetic gives the same float midpoint as a row-wise apply,
# without calling a Python lambda once per row.
df['avgSalary']=(df.bottomSalary+df.topSalary)/2

# value_counts must be called; the bare attribute is only the bound method.
df['avgSalary'].value_counts()

由於僅按省份/直轄市進行統計，但所給數據中含有地級市及區等，因此對數據進行處理，僅保留省份/直轄市（取City字段的前兩個字符）

自定義函數newCity

def newCity(city):
    """Return the first two characters of a city string.

    Strings of two characters or fewer come back unchanged. Values that are
    not strings (for example NaN for a missing cell) are returned as-is.
    Previously the length check ran on ``str(city)`` while the slice ran on
    ``city`` itself, so NaN raised TypeError.

    Args:
        city: Raw value from the City column.

    Returns:
        The two-character prefix for strings, otherwise ``city`` unchanged.
    """
    if not isinstance(city, str):
        return city
    return city[:2]

df['newcity']=df.City.apply(newCity)

數據基本處理完成，保存為df_clean

df_clean = df[["PositionName", "CompanyName", "newcity", "Experience", "JobWords", "avgSalary"]]
df_clean.head()

查看數據的描述性信息

print(df_clean.describe())

平均薪資：21.85W，中位數：19W，最高：177.5W

薪資分布情況圖

plt.rcParams['font.sans-serif']=['SimHei']
df_clean.avgSalary.hist(bins=20)
plt.show()

分割Experience：`str.split(' ', expand=True)` 產生的列數取決於分割后字段最多的那一行，這里有的行被空格分成了八段，所以結果有8列。實際只需要前四列（學歷、工作年限、語言、年齡），多余的列丟棄即可

# expand=True yields one column per space-separated token, so the column
# count follows the longest row in the data. Only the first four columns are
# needed; taking them by position avoids hard-coding the total column count.
info_split=df_clean['Experience'].str.split(' ',expand=True)

newExp=info_split.iloc[:, :4].copy()
newExp.columns=['education','experience','language','age']

display(newExp)

display(df_clean)

然后把兩個二維表進行鏈接，再保存為new_df，最開始是鏈接之后刪除experience，但是不知道為什么鏈接之后刪除newcity就變成了city，之前的city白處理了。然后就直接保存了

# Put the split Experience fields next to the cleaned columns (aligned on index).
newDF=pd.concat([df_clean, newExp], axis=1)

# .copy() makes new_df an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
new_df = newDF[["PositionName", "CompanyName", "newcity",'education','experience','language','age' , "JobWords", "avgSalary"]].copy()

display(new_df)

轉換分類數據，這里發現本科有兩個，然后其他數據不是很直觀，后續有對這個數據進行了處理

new_df.education.astype('category')

自定義newEdu處理教育水平，寫的有點復雜，之前的寫法不知道為什么最后的結構只剩下本科和碩士。

def newEdu(education):
    """Map a raw education requirement to its short label.

    Any value not listed in the table below, including missing values, is
    labelled "博士".

    Args:
        education: Raw education text.

    Returns:
        The short label as a string.
    """
    label_table = (
        ("碩士及以上", "碩士"),
        ("統招本科", "本科"),
        ("本科及以上", "本科"),
        ("學歷不限", "不限"),
        ("大專及以上", "大專"),
        ("中專/中技及以上", "中專"),
    )
    for raw_text, short_label in label_table:
        if education == raw_text:
            return short_label
    return "博士"

new_df['new_edu'] = new_df.education.apply(newEdu)

new_df.new_edu.astype('category')

選用箱線圖進行比較。其最大的優點就是不受異常值的影響，可以以一種相對穩定的方式描述數據的平均水平、波動程度和異常值分布情況。

# Convert to a categorical and fix the category order. set_categories returns
# a new Series; its inplace= argument was removed in pandas 2.0, so the
# result is assigned back to the column.
new_df['new_edu']=new_df.new_edu.astype('category')
new_df['new_edu']=new_df.new_edu.cat.set_categories(["中專", "博士", "大專", "不限", "本科", "碩士"])
ax=new_df.boxplot(column='avgSalary',by='new_edu',figsize=(9,6))

print(new_df.groupby(new_df.new_edu).avgSalary.mean().sort_values(ascending=False))

如圖1，本科中位數薪資高於碩士生，容易誤以為本科薪資高於碩士生，但同時結合圖2，可見碩士生的平均薪資水平遠高於本科生，由此可知，學歷越高，薪資越高，知識改變命運。

轉換數據類型（工作年限），創建箱線圖進行比較

new_df.experience.astype('category')

new_df.boxplot(column='avgSalary',by='experience',figsize=(9,6))

工作年限和薪資的比較

print(new_df.groupby(new_df.experience).avgSalary.mean().sort_values(ascending=False))

薪資與工作年限有很大關系，但優秀員工薪資明顯超越年限限制。

北京和上海這兩座城市，學歷對薪資的影響

df_sz_bj=new_df[new_df['newcity'].isin(['上海','北京'])]
df_sz_bj.boxplot(column='avgSalary',by=['new_edu','newcity'],figsize=[14,6])
plt.show()

薪資與工作區域有很大關系，北京的薪資不管什么學歷都高於上海同等學歷的薪資

不同城市，招聘數據分析需求前五的公司
自定義了函數topN，將傳入的數據計數，並且從大到小返回前五的數據。然后以newcity聚合分組，因為求的是前5的公司，所以對CompanyName調用topN函數。

new_df.groupby('CompanyName').avgSalary.agg(lambda x:max(x)-min(x))

def topN(df,n=5):
    """Return the ``n`` most frequent values of ``df`` with their counts.

    Args:
        df: Object exposing ``value_counts()``; called here on one column of
            each group.
        n: Number of leading entries to keep.

    Returns:
        The counts sorted in descending order, cut to the first ``n`` entries.
    """
    return df.value_counts().sort_values(ascending=False)[:n]

print(new_df.groupby('newcity').CompanyName.apply(topN))

職位需求的前五，以計算機行業為主

print(new_df.groupby('newcity').PositionName.apply(topN))

將上海和北京的薪資數據以直方圖的形式進行對比

plt.hist(x=new_df[new_df.newcity=='上海'].avgSalary,
         bins=15,
         density=1,
         facecolor='blue',
         alpha=0.5)
plt.hist(x=new_df[new_df.newcity=='北京'].avgSalary,
         bins=15,
         density=1,
         facecolor='red',
         alpha=0.5)
plt.show()

做一個所需要做的工作的詞雲，先下載wordcloud庫
在anaconda下載第三方庫還挺麻煩的，鏡像還不能用，只能下載之后導包
查看數據進行處理

print(new_df.JobWords)

重置索引然后作詞雲

# NOTE(review): `df_word` is not defined anywhere in the visible file, so this
# line raises NameError as written. Presumably it is a table of keywords
# derived from new_df.JobWords -- TODO restore the missing step.
# Flatten df_word, drop missing entries and count rows per 'level_0' value.
df_word_counts=df_word.unstack().dropna().reset_index().groupby('level_0').count()

from wordcloud import WordCloud
# Remove single quotes from the index labels.
df_word_counts.index=df_word_counts.index.str.replace("'","")

# The font file is read from a hard-coded Windows font path.
wc=WordCloud(font_path=r'C:\Windows\Fonts\FZSTK.TTF',width=900,height=400,background_color='white')
fig,ax=plt.subplots(figsize=(20,15))
# Build the cloud from the per-label counts in the level_1 column.
wc.fit_words(df_word_counts.level_1)
# `ax` is rebound here to the image object returned by imshow.
ax=plt.imshow(wc)
plt.axis('off')
plt.show()

上圖可見對統計分析，數學，英語和office使用還是有一定的要求。

完整代碼

#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the job-posting data from a hard-coded local Windows path.
df=pd.read_csv(r'C:\Users\MSI\Desktop\1.csv')

# Preview the first rows.
df.head()

# Print the column summary of the frame.
df.info()

# Rows sharing both PositionName and CompanyName are duplicates; keep the
# first occurrence and drop the rest in place.
df.drop_duplicates(['PositionName','CompanyName'],keep='first', inplace=True)

# Summary again, after de-duplication.
df.info()

# Relative frequency of each distinct Salary string (the [0:] slice keeps
# the whole string).
df['Salary'].str[0:].value_counts(normalize = True)

def drops(col, tag):
    """Drop, in place, every row of the module-level ``df`` whose ``col`` contains ``tag``.

    ``tag`` is handed to ``Series.str.contains`` unchanged, so it keeps that
    method's default pattern handling.

    Args:
        col: Name of a string column of ``df``.
        tag: Pattern to look for in each cell of ``col``.
    """
    # na=False: a missing cell counts as "does not contain" instead of putting
    # NaN into the mask, which would make the boolean indexing below raise.
    matches = df[col].str.contains(tag, na=False)
    df.drop(df[matches].index, inplace=True)

# Remove every row whose Salary contains '面議'.
drops('Salary', '面議')

# Salary frequencies again, after the removal.
df['Salary'].str[0:].value_counts(normalize = True)

def cutWord(word,method):
    """Return one bound of a salary string such as ``"10-20萬"``.

    The text before the first ``-`` is the lower bound. The text after it,
    minus the final character (presumably a unit suffix -- confirm against
    the data), is the upper bound. A string without ``-`` is treated as a
    single value that serves as both bounds; previously this case raised
    UnboundLocalError.

    Args:
        word: Raw salary string.
        method: ``'bottom'`` for the lower bound; any other value returns
            the upper bound.

    Returns:
        The selected bound as a string (not yet converted to a number).
    """
    position = word.find("-")
    length = len(word)
    if position != -1:
        bottomSalary = word[:position]
        topSalary = word[position + 1:length - 1]
    else:
        # No range separator: strip the trailing character the same way the
        # upper bound is stripped and use the value for both bounds.
        bottomSalary = topSalary = word[:length - 1]
    return bottomSalary if method == 'bottom' else topSalary

# Split every Salary string into its upper and lower bound (still strings).
df['topSalary']=df.Salary.apply(cutWord,method='top')
df['bottomSalary']=df.Salary.apply(cutWord,method='bottom')

# Convert both bounds to integers so they can be averaged.
df.topSalary=df.topSalary.astype("int")
df.bottomSalary=df.bottomSalary.astype("int")

# Column arithmetic gives the same float midpoint as a row-wise apply,
# without calling a Python lambda once per row.
df['avgSalary']=(df.bottomSalary+df.topSalary)/2

# value_counts must be called; the bare attribute is only the bound method.
df['avgSalary'].value_counts()

def newCity(city):
    """Return the first two characters of a city string.

    Strings of two characters or fewer come back unchanged. Values that are
    not strings (for example NaN for a missing cell) are returned as-is.
    Previously the length check ran on ``str(city)`` while the slice ran on
    ``city`` itself, so NaN raised TypeError.

    Args:
        city: Raw value from the City column.

    Returns:
        The two-character prefix for strings, otherwise ``city`` unchanged.
    """
    if not isinstance(city, str):
        return city
    return city[:2]

# Reduce every City value to its two-character prefix.
df['newcity']=df.City.apply(newCity)

# Keep only the columns used by the rest of the analysis.
df_clean = df[["PositionName", "CompanyName", "newcity", "Experience", "JobWords", "avgSalary"]]
df_clean.head()

print(df_clean.describe())

# SimHei is set as the sans-serif font, presumably so that Chinese labels
# render in the figures.
plt.rcParams['font.sans-serif']=['SimHei']
df_clean.avgSalary.hist(bins=20)
plt.show()

# expand=True yields one column per space-separated token, so the column
# count follows the longest row in the data. Only the first four columns are
# needed; taking them by position avoids hard-coding the total column count.
info_split=df_clean['Experience'].str.split(' ',expand=True)

newExp=info_split.iloc[:, :4].copy()
newExp.columns=['education','experience','language','age']

display(newExp)

display(df_clean)

# Put the split Experience fields next to the cleaned columns (aligned on index).
newDF=pd.concat([df_clean, newExp], axis=1)

# .copy() makes new_df an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
new_df = newDF[["PositionName", "CompanyName", "newcity",'education','experience','language','age' , "JobWords", "avgSalary"]].copy()

display(new_df)

# Notebook-style inspection of the education values; the result is not stored.
new_df.education.astype('category')

def newEdu(education):
    """Map a raw education requirement to its short label.

    Any value not listed in the table below, including missing values, is
    labelled "博士".

    Args:
        education: Raw education text.

    Returns:
        The short label as a string.
    """
    label_table = (
        ("碩士及以上", "碩士"),
        ("統招本科", "本科"),
        ("本科及以上", "本科"),
        ("學歷不限", "不限"),
        ("大專及以上", "大專"),
        ("中專/中技及以上", "中專"),
    )
    for raw_text, short_label in label_table:
        if education == raw_text:
            return short_label
    return "博士"

# Normalise the raw education text to short labels.
new_df['new_edu'] = new_df.education.apply(newEdu)

new_df.new_edu.astype('category')

# Convert to a categorical and fix the category order. set_categories returns
# a new Series; its inplace= argument was removed in pandas 2.0, so the
# result is assigned back to the column.
new_df['new_edu']=new_df.new_edu.astype('category')
new_df['new_edu']=new_df.new_edu.cat.set_categories(["中專", "博士", "大專", "不限", "本科", "碩士"])
ax=new_df.boxplot(column='avgSalary',by='new_edu',figsize=(9,6))

# Mean salary per education label, highest first.
print(new_df.groupby(new_df.new_edu).avgSalary.mean().sort_values(ascending=False))

new_df.experience.astype('category')

new_df.boxplot(column='avgSalary',by='experience',figsize=(9,6))

# Mean salary per experience value, highest first.
print(new_df.groupby(new_df.experience).avgSalary.mean().sort_values(ascending=False))

# Restrict to Shanghai and Beijing and compare salary by education and city.
df_sz_bj=new_df[new_df['newcity'].isin(['上海','北京'])]
df_sz_bj.boxplot(column='avgSalary',by=['new_edu','newcity'],figsize=[14,6])
plt.show()

# Spread between the highest and lowest average salary within each company.
new_df.groupby('CompanyName').avgSalary.agg(lambda x:max(x)-min(x))

def topN(df,n=5):
    """Return the ``n`` most frequent values of ``df`` with their counts.

    Args:
        df: Object exposing ``value_counts()``; called here on one column of
            each group.
        n: Number of leading entries to keep.

    Returns:
        The counts sorted in descending order, cut to the first ``n`` entries.
    """
    return df.value_counts().sort_values(ascending=False)[:n]

# Per city: the companies that appear most often (topN keeps five by default).
print(new_df.groupby('newcity').CompanyName.apply(topN))

# Per city: the position names that appear most often.
print(new_df.groupby('newcity').PositionName.apply(topN))

# Overlay the normalised salary histograms of Shanghai (blue) and Beijing
# (red); alpha=0.5 keeps both visible where they overlap.
plt.hist(x=new_df[new_df.newcity=='上海'].avgSalary,
         bins=15,
         density=1,
         facecolor='blue',
         alpha=0.5)
plt.hist(x=new_df[new_df.newcity=='北京'].avgSalary,
         bins=15,
         density=1,
         facecolor='red',
         alpha=0.5)
plt.show()

print(new_df.JobWords)

# NOTE(review): `df_word` is not defined anywhere in the visible file, so this
# line raises NameError as written. Presumably it is a table of keywords
# derived from new_df.JobWords -- TODO restore the missing step.
# Flatten df_word, drop missing entries and count rows per 'level_0' value.
df_word_counts=df_word.unstack().dropna().reset_index().groupby('level_0').count()

from wordcloud import WordCloud
# Remove single quotes from the index labels.
df_word_counts.index=df_word_counts.index.str.replace("'","")

# The font file is read from a hard-coded Windows font path.
wc=WordCloud(font_path=r'C:\Windows\Fonts\FZSTK.TTF',width=900,height=400,background_color='white')
fig,ax=plt.subplots(figsize=(20,15))
# Build the cloud from the per-label counts in the level_1 column.
wc.fit_words(df_word_counts.level_1)
# `ax` is rebound here to the image object returned by imshow.
ax=plt.imshow(wc)
plt.axis('off')
plt.show()

參考資料：
https://www.jianshu.com/p/1e1081ca13b5

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python數據分析（6）----招聘信息數據分析數據分析崗位招聘情況【Python數據分析】可視化圖表分析拉鈎網招聘數據數據分析師招聘分析2.0 Python--智聯招聘網站的數據分析 Python數據分析 Python數據分析 Python之數據分析 python數據分析與展示（一） python數據分析畫圖體驗