Python爬取獵聘網的數據進行分析

本文轉載自查看原文 2021-06-21 18:52 347

前言：

一、選題的背景　

　　近年來，越來越多的年輕人在尋找工作時遇到各種問題：好的工作非常難找，差的工作很多年輕人又不想做。因此我選擇用數據來分析招聘網站上各類工作的情況。

二、項目目標分析

　　本項目是對獵聘網的數據進行爬取分析,主要分析的目標是招聘信息，學歷要求等；

　　具體來說，分析獵聘網上各職位的招聘信息，包括薪資及其他福利待遇，以及對求職者學歷要求的高低。

三、網絡爬蟲設計方案

（1）爬蟲名稱：爬取獵聘網的數據進行分析　　

參考書籍：

《Python3網絡爬蟲開發實戰》-崔慶才
《Python數據可視化》-Kirthi Raman
《深入淺出MySQL》

參考文檔：

四、python爬蟲抓取IT類招聘信息的實現

2.1、代碼

  1 import requests
  2 import lxml
  3 import re
  4 import pymysql
  5 from bs4 import BeautifulSoup
  6 from multiprocessing import Pool
  7 
  8 def getTableName(ID):
  9 """
 10 有些分類標識符ID中帶有MySql數據庫表名不支持的符號，該函數返回合法表名
 11 """
 12 replaceDict={
 13 "Node.JS":"NodeJS",
 14 ".NET":"NET",
 15 "C#":"CC",
 16 "C++":"CPP",
 17 "COCOS2D-X":"COCOS2DX"
 18 }
 19 if ID in replaceDict:
 20 return replaceDict[ID]
 21 else:
 22 return ID
 23 
 24 def parseWage(wage):
 25 """
 26 該函數實現了解析工資字符串wage，如果是'面議'或者其它則返回列表[0,0](代表工資面議）,否則返回
 27 相應工資（數值類型,單位為萬)
 28 """
 29 parsedResult=re.findall('(.*?)-(.*?)萬.*?',wage,re.S)
 30 if not parsedResult:
 31 return [0,0]
 32 else:
 33 return [parsedResult[0][0],parsedResult[0][1]]
 34 
 35 
 36 """
 37 該函數實現判斷某一個表是否在數據庫方案里存在，存在返回True，不存在返回False
 38 """
 39 sql = "show tables;"
 40 cursor.execute(sql)
 41 tables = [cursor.fetchall()]
 42 table_list = re.findall('(\'.*?\')',str(tables))
 43 table_list = [re.sub("'",'',each) for each in table_list]
 44 if table_name in table_list:
 45 return True
 46 else:
 47 return False
 48 def isUrlValid(url):
 49 """
 50 由於在爬蟲運行過程中發現有些類別招聘信息中含有的詳細招聘信息的入口地址在獲取響應的時候會拋出Missing Schema異常，
 51 發現是url中有些是.../job/...（往往是獵聘網自己發布的招聘信息），有些是.../a/...（這類招聘信息通常是代理發布），
 52 導致無法解析，從而使爬蟲運行到此處時停止抓取數據。
 53 該函數實現對代理發布的URL進行過濾，若為代理發布的信息，則跳過該條招聘信息，函數返回False,否則返回True。
 54 """
 55 isValid=re.search('.*?www\.liepin\.com/job/.*?$',url,re.S)
 56 if isValid:
 57 return True
 58 else:
 59 return False
 60 
 61 def getPageHtml(url,headers=None):
 62 """
 63 返回服務器響應頁面的html,不成功返回None
 64 """
 65 if not headers:
 66 headers={
 67 "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
 68 }
 69 
 70 try:
 71 response=requests.get(url,headers=headers)
 72 if response.status_code==200:
 73 return response.text
 74 else:
 75 return None
 76 except requests.RequestException as e:
 77 #debug
 78 print('Exception occur in funciton getPageHtml()')
 79 
 80 return None
 81 def getEntry(html):
 82 """
 83 解析Html,該函數為生成器類型，每一次迭代返回某一子項目的入口地址URL和description組成的字典entry
 84 """
 85 
 86 if not html:
 87 #html為None則返回None，無法從該html中解析出子項目入口地址
 88 #debug
 89 print('html is None in function getEntry()')
 90 return None
 91 
 92 soup=BeautifulSoup(html,'lxml')
 93 for items in soup.find_all(name='li'):
 94 for item in items.find_all(name='dd'):
 95 for usefulURL in item.find_all(name='a',attrs={"target":"_blank","rel":"nofollow"}):
 96 yield{
 97 "URL":'https://www.liepin.com'+usefulURL.attrs['href'],
 98 "URL_Description":usefulURL.text
 99 }
100 
101 
def getCountryEntry(entry):
    """Resolve a category entry to its nationwide listing page.

    ``entry`` is a dict with ``"URL"`` and ``"URL_Description"`` as produced
    by ``getEntry``.  The category page is fetched, the nationwide link is
    read from the city filter, and the per-category MySQL table is created
    (cloned from ``spider.positiondescription``) if it does not exist yet.

    Returns:
        A dict with ``"CountryURL"`` and ``"CountryURLDescription"``, or
        None when the entry is empty, the page cannot be fetched, or the
        nationwide link cannot be found.
    """
    if not entry:
        # debug
        print('ERROR in function getCountryEntry:entry is None')
        return None

    headers = {
        "Host": "www.liepin.com",
        "Referer": "https://www.liepin.com/it/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }

    countryHtml = getPageHtml(entry['URL'], headers=headers)
    if not countryHtml:
        # debug: the fetch failed, there is nothing to parse
        print('ERROR in function getCountryEntry():countryHtml is None.')
        return None

    soup = BeautifulSoup(countryHtml, 'lxml')
    citiesInfo = soup.find(name='dd', attrs={"data-param": "city"})
    if not citiesInfo:
        # debug
        print('ERROR in function getCountryEntry():citiesInfo is None.')
        return None

    countryLink = citiesInfo.find(name='a', attrs={"rel": "nofollow"})
    if countryLink is None or not countryLink.attrs.get('href'):
        # debug
        print('ERROR in function getCountryEntry():country link is None.')
        return None

    # Check for and create the table under the same sanitised name,
    # otherwise categories such as "C++" are never found.
    tableName = getTableName(entry['URL_Description'])
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    try:
        cursor = db.cursor()
        if not table_exists(cursor, tableName):
            createTableSql = """CREATE TABLE IF NOT EXISTS spider.{} like spider.positiondescription;""".format(tableName)
            cursor.execute(createTableSql)
            print('--------------create table %s--------------------' % (entry['URL_Description']))
    except pymysql.MySQLError as e:
        print('error in function getCountryEntry():create table failed.', e)
    finally:
        # Close the connection whether or not the table already existed.
        db.close()

    return {
        "CountryURL": "https://www.liepin.com" + countryLink.attrs['href'],
        "CountryURLDescription": entry['URL_Description']
    }
146 
def getCountryEmployeeInfo(CountryEntry):
    """Crawl every listing page of one category and store each posting.

    ``CountryEntry`` is the dict returned by ``getCountryEntry``
    (``"CountryURL"`` and ``"CountryURLDescription"``).  Every listing page
    is walked, each directly published posting is fetched and parsed with
    ``detailedInformation``, and one row per available posting is inserted
    into the category's table in the ``spider`` database.

    Returns:
        None.  Failures are reported on stdout and the crawl continues
        where possible.
    """
    if not CountryEntry:
        # debug
        print('ERROR in function getCountryEmpolyeeInfo():CountryEntry is None.')
        return None

    category = CountryEntry['CountryURLDescription']
    # Values are passed as query parameters so that quotes in scraped text
    # cannot break (or inject into) the statement; only the table name,
    # which cannot be parameterised, is formatted in.
    insertSql = "insert into spider.{} values(0,{});".format(
        getTableName(category), ",".join(["%s"] * 12)
    )
    # The total page count is embedded in the pager's onclick handler.
    totalPagesPattern = re.compile(r'Math\.min\(Math\.max\(\$pn,\s\d\),\s*(\d+)\s*\)')

    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    try:
        cursor = db.cursor()
        indexOfPage = 0
        theMaxLength = 0

        # Walk every listing page of this category.
        while indexOfPage <= theMaxLength:
            URL = CountryEntry['CountryURL'] + '&curPage=' + str(indexOfPage)
            pageHtml = getPageHtml(URL)
            if not pageHtml:
                print('ERROR in function getCountryEmployeeInfo():page {} of {} is None.'.format(indexOfPage, category))
                if indexOfPage == 0:
                    # Without the first page the page count is unknown.
                    break
                indexOfPage += 1
                continue
            soup = BeautifulSoup(pageHtml, 'lxml')

            # The page count only needs to be read once, from the first page.
            if indexOfPage == 0:
                pager = soup.find(name='a', attrs={"class": "go", "href": "javascript:;"})
                onclick = pager.attrs.get('onclick', '') if pager is not None else ''
                found = totalPagesPattern.findall(onclick)
                if found:
                    theMaxLength = int(found[0])

            # debug: report which page is being visited
            print('Accessing page {} of {}'.format(indexOfPage, category))
            indexOfPage += 1

            for detailedDescriptionURL in getDetailedDescriptionURL(soup):
                # Postings published by an agent are not crawled.
                if not isUrlValid(detailedDescriptionURL):
                    continue
                result = detailedInformation(getPageHtml(detailedDescriptionURL))
                # None: the fetch failed; Available False: posting removed.
                if not result or not result.get('Available'):
                    continue

                min_max = parseWage(result['wage'])
                # Some companies list no benefit tags.
                reallyTag = result['tag'] or '無'
                params = (
                    result['position'], result['company'], result['wage'],
                    min_max[0], min_max[1],
                    result['education'], result['workExperience'],
                    result['language'], result['age'],
                    result['description'], reallyTag, result['workPlace'],
                )
                try:
                    cursor.execute(insertSql, params)
                    db.commit()
                except pymysql.MySQLError as e:
                    db.rollback()
                    # debug
                    print('ERROR in function getCountryEmployeeInfo():execute sql failed.', e)
    finally:
        # Always release the connection once this category is done.
        db.close()
217 
def getDetailedDescriptionURL(soup):
    """Yield the detail-page URL of every posting on a listing page.

    ``soup`` is the parsed listing page.  Each ``div.job-info`` card
    contributes the ``href`` of its first ``target="_blank"`` anchor; cards
    without such an anchor or without an ``href`` are skipped.  Nothing is
    yielded when ``soup`` is None.
    """
    if not soup:
        # debug
        print('ERROR in function getDetailedDescroption():soup is None.')
        return

    for item in soup.find_all(name='div', attrs={"class": "job-info"}):
        link = item.find(name='a', attrs={"target": "_blank"})
        href = link.attrs.get('href') if link is not None else None
        if href:
            yield href
231 
232 
def detailedInformation(detailedDescriptionHtml):
    """Extract the fields of one posting from its detail-page HTML.

    Returns:
        None when ``detailedDescriptionHtml`` is empty or None.
        ``{'Available': False}`` when the page lacks the title or salary
        section (for example the posting was removed but is still linked
        from the listing).
        Otherwise a dict with ``Available`` (True), ``position``,
        ``company``, ``wage``, ``workPlace``, ``education``,
        ``workExperience``, ``language``, ``age``, ``tag`` (benefit tags
        joined by ``-``, empty string if none) and ``description``.
    """
    if not detailedDescriptionHtml:
        # debug
        print('ERROR in function detailedInformation():detailedDescriptionHtml is None.')
        return None

    soup = BeautifulSoup(detailedDescriptionHtml, 'lxml')

    # Position title and company name.
    positionItem = soup.find(name='div', attrs={"class": "title-info"})
    if positionItem is None or positionItem.h1 is None or positionItem.a is None:
        return {'Available': False}
    position = positionItem.h1.text
    company = positionItem.a.text

    # Salary (a string, sometimes "negotiable"), work place and requirements.
    items = soup.find(name='div', attrs={"class": "job-title-left"})
    if items is None:
        return {'Available': False}
    wageItem = items.find(name='p', attrs={"class": "job-item-title"})
    if wageItem is None:
        return {'Available': False}
    wage = wageItem.text.split('\r')[0]

    # Postings located abroad have no city link; fall back to the plain
    # text in the basic-info paragraph.
    workPlaceLink = items.find(name='a')
    if workPlaceLink is not None:
        workPlace = workPlaceLink.text
    else:
        basicInfo = items.find(name='p', attrs={"class": "basic-infor"})
        placeSpan = basicInfo.span if basicInfo is not None else None
        workPlace = placeSpan.text.strip() if placeSpan is not None else ''

    # Four spans in order: education, work experience, language, age.
    # Pad with empty strings so a shorter list cannot raise IndexError.
    qualifications = items.find(name='div', attrs={"class": "job-qualifications"})
    spans = qualifications.find_all(name='span') if qualifications is not None else []
    allFourNec = [span.text for span in spans[:4]]
    allFourNec += [''] * (4 - len(allFourNec))

    # Optional company benefit tags, joined by '-'.
    tagItems = soup.find(name='ul', attrs={"class": "comp-tag-list clearfix", "data-selector": "comp-tag-list"})
    tags = ''
    if tagItems:
        tags = '-'.join(tag.text for tag in tagItems.find_all(name='span'))

    # Free-text job description.
    description = ''
    descriptionItems = soup.find(name='div', attrs={"class": "job-item main-message job-description"})
    if descriptionItems is not None:
        content = descriptionItems.find(name='div', attrs={"class": "content content-word"})
        if content is not None:
            description = content.text.strip()

    positionDescription = {
        "Available": True,
        "position": position,
        "company": company,
        "wage": wage,
        "workPlace": workPlace,
        "education": allFourNec[0],
        "workExperience": allFourNec[1],
        "language": allFourNec[2],
        "age": allFourNec[3],
        "tag": tags,
        "description": description,
    }

    return positionDescription
if __name__ == "__main__":
    # Entry point: crawl the IT landing page, then process each category
    # in a pool of worker processes.
    startURL = 'https://www.liepin.com/it/'
    startHtml = getPageHtml(startURL)

    pool = Pool(4)
    pending = []
    for entry in getEntry(startHtml):
        countryEntry = getCountryEntry(entry)
        if not countryEntry:
            # Nothing to crawl for this category; do not waste a worker.
            continue
        task = pool.apply_async(getCountryEmployeeInfo, args=(countryEntry,))
        pending.append((entry['URL_Description'], task))
    pool.close()
    pool.join()

    # apply_async swallows worker exceptions until the result is read.
    for description, task in pending:
        try:
            task.get()
        except Exception as e:
            print('Subprocess for {} failed: {!r}'.format(description, e))
    print('All subprocesses done.')

2.2 代碼的部分補充說明

2.2.1 getEntry(html)

　　爬蟲初始入口地址startURL對應於獵聘網下圖所示的頁面

　　在該頁面對應於代碼中的函數getEntry(html)，下面是子項目入口地址於HTML中所在的位置：

2.2.2 getCountryEntry(entry)

　　對應於如下頁面：

　　不僅僅只抓取當前地區，需要抓取全國的招聘信息，因此要進入全國招聘信息的URL，位置如下所示：

　　第一條href即為全國的入口URL。

　　除此之外在該函數中還根據每一個招聘信息子分類創建了各自的表，由於有些分類名中含有MySQL數據庫表名不支持的特殊符號，所以函數getTableName(ID)功能就是根據分類名ID返回合法表名。

2.2.3 getCountryEmployeeInfo(CountryEntry)

　　對應如下頁面：

　　在該頁面可以抓取到每一條招聘信息詳細信息的入口地址URL，以及該分類（在上圖中就是Java）中總頁面數量：

　　招聘信息詳細信息的入口地址URL位置如下所示：

　　總頁面數所在位置如下所示（即div class="pager"那一個標簽中）：

2.2.4 detailedInformation(detailedDescriptionHtml)

　　對應於如下所示的頁面結構：

　　具體在該頁面上抓取的信息所在位置在函數detailedInformation(detailedDescriptionHtml)中表示得很明白。

五、數據庫部分

　　在上面給出的代碼中，在函數getCountryEntry中對每一類招聘信息都在數據庫spider中（spider是我事先已經建立好的一個schema）建立了相應的表，其中有一個positionDescription表是我在抓取前建立的一個模板表，目的是為了便於各分類表的創建，各分類表的結構都和positionDescription表一樣，其字段類型如下所示：

　　然后抓取之后就會生成各分類的表，這些表的結構和positionDescription表結構完全一樣，例如（未完全顯示）：

六、數據可視化的實現

6.1 條形圖

6.1.1 各分類招聘信息數量條形圖（水平柱狀圖）

import matplotlib.pyplot as plt
import matplotlib
import pymysql
 
def drawPic():
    """Draw a horizontal bar chart of the row count of every category table.

    Row counts are read from ``information_schema.tables`` for the
    ``spider`` schema; tables reporting zero rows are left out.  Nothing is
    drawn when the query fails or returns no usable rows.
    """
    tags = []
    amount = []

    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spider')
    try:
        cursor = db.cursor()
        # NOTE(review): table_rows is reportedly only an estimate for InnoDB
        # tables -- confirm, and use COUNT(*) if exact numbers matter.
        cursor.execute("select table_name as 表名,table_rows from information_schema.tables where table_schema='spider' order by table_rows desc;")
        for item in cursor.fetchall():
            if item[1]:
                tags.append(item[0])
                amount.append(item[1])
    except pymysql.MySQLError as e:
        print('failed', e)
    finally:
        db.close()

    if not amount:
        # min()/max() below would raise on an empty sequence.
        print('no data to plot')
        return

    # Use a font that contains CJK glyphs so Chinese labels render.
    plt.rcParams['font.sans-serif'] = ['FangSong']
    plt.rcParams['axes.unicode_minus'] = False

    positions = range(len(tags))
    plt.barh(positions, amount, height=0.7, color='steelblue', alpha=0.8)
    plt.yticks(positions, tags)
    plt.xlim(min(amount) - 10, max(amount) + 100)
    plt.xlabel("招聘信息數量")
    plt.title("各分類招聘信息數量")
    # Print each count next to its bar.
    for x, y in enumerate(amount):
        plt.text(y + 1, x - 0.4, '%s' % y)
    plt.show()


if __name__ == '__main__':
    drawPic()

　　效果如下所示：

6.2 圓餅圖

6.2.1 按照學歷要求分類繪制圓餅圖

代碼如下：

import matplotlib.pyplot as plt
import matplotlib
import pymysql
 
#根據學歷要求繪制圓餅圖
def drawPic():
    """Show a pie chart of the share of postings per education requirement.

    The posting counts are fixed figures hard-coded below; each slice is
    the count expressed as a percentage of the 79380 postings in total.
    """
    # Use a font that contains CJK glyphs so Chinese labels render.
    plt.rcParams['font.sans-serif'] = ['FangSong']
    plt.rcParams['axes.unicode_minus'] = False

    labels = ('本科及以上', '大專及以上', '統招本科', '學歷不限', '其它')
    total = 79380
    # The last entry lumps the three smallest groups together.
    counts = [39519, 16726, 15844, 5781, (1102 + 211 + 197)]
    sizes = [count / total * 100 for count in counts]
    # Pull the first (largest) slice slightly out of the pie.
    explode = (0.1, 0, 0, 0, 0)

    fig1, ax1 = plt.subplots()
    ax1.pie(
        sizes,
        explode=explode,
        labels=labels,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
    )
    # Equal aspect ratio keeps the pie circular.
    ax1.axis('equal')
    ax1.set_title('招聘信息學歷要求占比')

    plt.show()

　　效果如下所示：

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。