I'm a statistics major, more or less, so I figured I'd scrape the statistics job listings off Zhaopin (智聯). Um, turns out demand for these positions is pretty strong...
I use requests and bs4 here for fetching and parsing, and store the data in a MySQL database. The code is fairly messy; bear with it for now, I'll tidy it up when I have time...
import requests
from bs4 import BeautifulSoup
import re
import time
import datetime
import MySQLdb

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"}

# The page range can be set to anything within the span covered by the search results
urls = ["http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_{}_0".format(x)
        for x in range(2, 20)]

db = MySQLdb.connect(host="localhost", user="root",
                     passwd="your password here, mine stays secret",
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()

# Make the columns generous -- some company descriptions run long...
# (note: this statement fails if the table already exists; drop it first when re-running)
cursor.execute("""CREATE TABLE zhilian_tongji(
    post_name VARCHAR(100),
    fact_name VARCHAR(100),
    city_name VARCHAR(20),
    num       VARCHAR(50),
    lab       VARCHAR(200),
    post_time VARCHAR(50),
    now_time  VARCHAR(50)
)""")


# Fetch a results page and hand it to the parser
def get_save_url(url, headers):
    resp = requests.get(url, headers=headers)
    if resp.status_code == 200:
        print("Opened page " + url)
        soup = BeautifulSoup(resp.text, 'lxml')
        print("Scraping job listings on page " + pagenum)
        get_post_fact_city_num(soup, resp.text)


# Pull job title, company, city, headcount, industry and posting date off the page
def get_post_fact_city_num(soup, html):
    l_post_name, l_fact_name, l_city_name = [], [], []
    l_num, l_lab, l_post_time = [], [], []

    regex = "__ga__fullResult(.*)postname_clicksfullresult(.*)postnames_001"
    posts = soup.findAll("a", {"class": re.compile(regex)})
    for post in posts[::2]:      # each listing appears twice, keep every other one
        l_post_name.append(post.get_text())

    facts = soup.findAll("p", {"class": "searchResultCompanyname"})
    for fact in facts[::2]:
        l_fact_name.append(fact.get_text())

    cities = soup.findAll("em", {"class": "searchResultJobCityval"})
    for city in cities[::2]:
        l_city_name.append(city.get_text())

    nums = soup.findAll("em", {"class": "searchResultJobPeopnum"})
    for num in nums:
        l_num.append(num.get_text())

    labs = soup.findAll("p", {"class": "searchResultCompanyIndustry"})
    for lab in labs:
        l_lab.append(lab.get_text())

    # The posting date has no usable tag of its own, so match it in the raw
    # HTML; "發布時間" is the site's label for the posting date
    time_pa = re.compile(r"<span>發布時間\:<em></em>(.*)</span>")
    for t in re.findall(time_pa, html):    # name it "t", not "time": don't shadow the module
        l_post_time.append(t)

    save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time)


def save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time):
    now_time = datetime.datetime.now()
    sql = """INSERT INTO zhilian_tongji
             SET post_name=%s, fact_name=%s, city_name=%s,
                 num=%s, lab=%s, post_time=%s, now_time=%s"""
    for x in range(len(l_post_name)):
        cursor.execute(sql, (l_post_name[x], l_fact_name[x], l_city_name[x],
                             l_num[x], l_lab[x], l_post_time[x], now_time))
    db.commit()
    print("Scraped and saved to the database!")


for url in urls:
    try:
        time.sleep(1)                    # be polite: one request per second
        pagenum = url.split("_")[-2]     # the page number is embedded in the URL
        get_save_url(url=url, headers=headers)
    except Exception:
        print("Page " + str(pagenum) + " failed...")

db.close()
print("All done!!!")
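One portability note: the MySQLdb module imported above comes from the old MySQL-python package, which only runs on Python 2. A minimal sketch of a common workaround on Python 3, assuming you have installed the pymysql package, is to register pymysql under the MySQLdb name before anything imports it:

# Assumption: pymysql is installed (pip install pymysql); this shim is only
# needed on Python 3, where MySQL-python (MySQLdb) is unavailable.
import pymysql
pymysql.install_as_MySQLdb()   # "import MySQLdb" now resolves to pymysql

After these two lines, the rest of the script can keep importing and using MySQLdb unchanged.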
The code's output is shown below.
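Going by the print calls in the script, each successfully scraped page produces console lines roughly like:

Opened page http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_2_0
Scraping job listings on page 2
Scraped and saved to the database!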
The database query results are shown below.
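For instance, a quick sanity-check query; this is just a sketch that reuses the cursor from the script above (run it before db.close()) and assumes nothing beyond the table the script creates. It tallies postings per city:

# Count scraped postings per city, most active cities first
cursor.execute("""SELECT city_name, COUNT(*) AS postings
                  FROM zhilian_tongji
                  GROUP BY city_name
                  ORDER BY postings DESC""")
for city, postings in cursor.fetchall():
    print(city, postings)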