因為馬上就要大四實習了,博主實在懶得在學校官網上一個個翻,直接用爬蟲將所有數據都爬下來
放在表格里,這樣感覺簡單多了,可惜還沒找到工作,so sad
總共選擇了三個學校:湖南大學,中南大學,湘潭大學
三個項目代碼分別如下(新手代碼,慘不忍睹):
湘潭大學:
#!/usr/bin/python3
# coding=utf-8
"""Scrape Xiangtan University's October-2018 daily career-talk listings.

For every day of October 2018, fetch the JSON listing from the campus jobs
site and append one spreadsheet row per career talk (time, place, company,
majors wanted, detail URL).
"""
import requests
import json
import logging
import os
import xlwt

basic_url = 'http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-'
logging.basicConfig(level=logging.DEBUG, format='')

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
# Header row. Column 4 is intentionally left blank, matching the data layout below.
sheet1.write(0, 0, '時間')
sheet1.write(0, 1, '地點')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '專業要求')
sheet1.write(0, 5, '詳細信息')

count = 1  # next spreadsheet row to fill (row 0 is the header)
for day in range(1, 32):  # October has 31 days
    # NOTE(review): the day is not zero-padded ('2018-10-1'); the endpoint
    # apparently accepts this — confirm before changing.
    url = basic_url + str(day)
    logging.debug('the clawer web site is:' + url)
    clawertext = requests.get(url)
    logging.debug(type(clawertext))
    logging.debug(clawertext.json())
    logging.debug(clawertext.json()['data'])
    logging.debug(type(clawertext.json()['data']))
    data_list = clawertext.json()['data']  # the useful data
    # BUG FIX: the inner loop previously reused the outer loop variable `i`,
    # shadowing the day index — confusing and error-prone.
    for talk in data_list:
        sheet1.write(count, 0, talk['meet_day'])
        sheet1.write(count, 1, talk['address'])
        sheet1.write(count, 2, talk['meet_name'])
        sheet1.write(count, 3, talk['professionals'])
        sheet1.write(count, 5,
                     'http://jobs.xtu.edu.cn/detail/career?id=' + talk['career_talk_id'])
        count = count + 1

# BUG FIX: xlwt produces the legacy BIFF (.xls) format; saving it with an
# .xlsx extension yields a file Excel warns about or refuses to open.
workbook.save('湘潭大學十月份招聘信息.xls')
中南大學:
這個最坑,花了我一個多小時
#!/usr/bin/python3
# coding=utf-8
"""Scrape Central South University career-talk announcements into a spreadsheet.

POST to the month-listing endpoint to get every announcement, then fetch and
parse each detail page with BeautifulSoup for the company name, time/place,
and the job-demand table.
"""
import requests
import xlwt
import json
import logging
import bs4
from bs4 import BeautifulSoup

# Configure the log file path and message format.
logging.basicConfig(filename='log.txt', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger('requests').setLevel(logging.WARNING)  # silence requests' own logging

# Set up the workbook and the header row.
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list')
sheet1.write(0, 0, '時間')
sheet1.write(0, 1, '地點')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '職位名稱')
sheet1.write(0, 4, '教育水平')
sheet1.write(0, 5, '專業要求')
sheet1.write(0, 6, '空缺數量')
sheet1.write(0, 7, '詳細信息')

# Fetch the full month's announcement list as JSON.
json_all_url = 'http://jobsky.csu.edu.cn/Home/SearchDateAllMonth'
dt1 = {'Date': '2018-09-04'}
post_data = requests.post(json_all_url, data=dt1)
json_data = post_data.json()
logging.debug(type(json_data))

basic_html_url = 'http://jobsky.csu.edu.cn/Home/ArticleDetails/'
counter_all = 1  # next spreadsheet row (row 0 is the header)
for data in json_data:
    company_Id = data['NewsID']
    html_url = basic_html_url + company_Id
    html_txt = requests.get(html_url)
    bs = BeautifulSoup(html_txt.text, 'lxml')

    # Company name comes from the page's <h1> title.
    list_soup_CN = bs.find('h1', attrs={'class': 'text-center title'})
    try:
        advertise_company_name = list_soup_CN.getText()
        sheet1.write(counter_all, 2, advertise_company_name)
    except Exception:  # BUG FIX: was a bare `except:`, which also swallows SystemExit
        logging.debug("the url" + html_url + 'has some problem')

    # Time and place of the career talk.
    try:
        list_soup_TP = bs.find('div', attrs={'id': 'placeAndTime'})
        advertise_time = list_soup_TP.find('p', attrs={'class': 'text-center time'}).getText()
        advertise_place = list_soup_TP.find('p', attrs={'class': 'text-center place'}).getText()
        sheet1.write(counter_all, 0, advertise_time)
        sheet1.write(counter_all, 1, advertise_place)
    except Exception:
        logging.debug("the url" + html_url + 'has some problem')

    # Job-demand table: the <td> cells at odd indices 1/3/5/7 hold the
    # position, education level, majors wanted, and vacancy count
    # (determined by inspecting the page source).
    try:
        list_soup_demand = bs.find('table', attrs={'class': 'table table-bordered'})
        list_td = list_soup_demand.find_all('td')
        for cell_index, td in enumerate(list_td):
            if cell_index == 1:
                sheet1.write(counter_all, 3, td.getText())
            if cell_index == 3:
                sheet1.write(counter_all, 4, td.getText())
            if cell_index == 5:
                sheet1.write(counter_all, 5, td.getText())
            if cell_index == 7:
                sheet1.write(counter_all, 6, td.getText())
    except Exception:
        logging.debug("the url" + html_url + 'has some problem')

    # BUG FIX: the detail-link write and the row counter increment used to
    # live inside the table try-block, so any page missing the demand table
    # caused the NEXT company to overwrite this company's row.
    sheet1.write(counter_all, 7, html_url)
    counter_all += 1

# BUG FIX: xlwt writes the legacy BIFF (.xls) format; the original saved it
# with a misleading .xlsx extension.
workbook.save('中南大學招聘信息.xls')
最后是湖南大學,不知道為什么,湖南大學招聘信息少的可憐
#!/usr/bin/python3
# coding=utf-8
"""Scrape Hunan University's career-talk listing into a spreadsheet.

One GET to the listing endpoint returns all career talks as JSON; each item
becomes a spreadsheet row (time, place, company, talk name, majors wanted,
detail URL).
"""
import requests
import json
import logging
import os
import xlwt

json_url = ('https://hnu.bysjy.com.cn/module/getcareers?start_page=1&keyword='
            '&type=inner&day=&count=15&start=1&_=1536044186160')
logging.basicConfig(level=logging.DEBUG, format='')
json_data = requests.get(json_url)

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
# Header row.
sheet1.write(0, 0, '時間')
sheet1.write(0, 1, '地點')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '招聘會')
sheet1.write(0, 4, '專業要求')
# BUG FIX: this header was written to column 6 while the data rows below
# write the detail link to column 5, leaving the header over an empty column.
sheet1.write(0, 5, '詳細信息')

count = 1  # next spreadsheet row (row 0 is the header)
data_list = json_data.json()['data']  # the useful data
for talk in data_list:
    sheet1.write(count, 0, talk['meet_day'] + talk['meet_time'])
    sheet1.write(count, 1, talk['address'])
    sheet1.write(count, 2, talk['company_name'])
    sheet1.write(count, 3, talk['meet_name'])
    sheet1.write(count, 4, talk['professionals'])
    sheet1.write(count, 5,
                 'https://hnu.bysjy.com.cn/detail/career?id=' + talk['career_talk_id'])
    count = count + 1

# BUG FIX: xlwt produces the legacy BIFF (.xls) format, so the saved file
# should not claim an .xlsx extension.
workbook.save('湖南大學招聘信息.xls')
