With senior-year internship season about to start, I really couldn't be bothered to page through each school's career site one posting at a time, so I just crawled everything and dumped it into spreadsheets. Feels much simpler this way. Still haven't found a job though, so sad.
I picked three schools in total: Hunan University, Central South University, and Xiangtan University.
The three scripts are below (newbie code, painful to look at):
Xiangtan University:
#!/usr/bin/python3
# coding=utf-8
import logging

import requests
import xlwt

basic_url = 'http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-'
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# Spreadsheet and header row (time, place, company, majors wanted, details link)
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
sheet1.write(0, 0, '时间')
sheet1.write(0, 1, '地点')
sheet1.write(0, 2, '公司名称')
sheet1.write(0, 3, '专业要求')
sheet1.write(0, 5, '详细信息')  # column 4 is left blank

count = 1
for day in range(1, 32):  # one JSON request per day of October
    url = basic_url + str(day)
    logging.debug('crawling: ' + url)
    data_list = requests.get(url).json()['data']  # the useful part of the response
    for item in data_list:
        sheet1.write(count, 0, item['meet_day'])
        sheet1.write(count, 1, item['address'])
        sheet1.write(count, 2, item['meet_name'])
        sheet1.write(count, 3, item['professionals'])
        sheet1.write(count, 5, 'http://jobs.xtu.edu.cn/detail/career?id=' + item['career_talk_id'])
        count += 1

# xlwt can only write the legacy .xls format, so use a matching extension
workbook.save('湘潭大学十月份招聘信息.xls')
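By the way, xlwt only writes the legacy .xls format, which is why I changed the extension above. If you actually want an .xlsx file, openpyxl does the same job; here is a minimal sketch of the same crawl with it (same fields, nothing else changed):

# Minimal sketch with openpyxl, which writes genuine .xlsx files
# (xlwt only handles the legacy .xls format).
from openpyxl import Workbook

import requests

wb = Workbook()
ws = wb.active
ws.title = 'list1'
ws.append(['时间', '地点', '公司名称', '专业要求', '详细信息'])  # header row

for day in range(1, 32):
    url = 'http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-' + str(day)
    for item in requests.get(url).json()['data']:
        ws.append([
            item['meet_day'],
            item['address'],
            item['meet_name'],
            item['professionals'],
            'http://jobs.xtu.edu.cn/detail/career?id=' + item['career_talk_id'],
        ])

wb.save('湘潭大学十月份招聘信息.xlsx')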
Central South University:
This one was the worst; it cost me over an hour.

#!/usr/bin/python3
# coding=utf-8
import logging

import requests
import xlwt
from bs4 import BeautifulSoup

# Log to a file, and silence requests' own chatty logging
logging.basicConfig(filename='log.txt', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger('requests').setLevel(logging.WARNING)

# Spreadsheet and header row (time, place, company, position, education,
# majors wanted, openings, details link)
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list')
sheet1.write(0, 0, '时间')
sheet1.write(0, 1, '地点')
sheet1.write(0, 2, '公司名称')
sheet1.write(0, 3, '职位名称')
sheet1.write(0, 4, '教育水平')
sheet1.write(0, 5, '专业要求')
sheet1.write(0, 6, '空缺数量')
sheet1.write(0, 7, '详细信息')

# One POST fetches the whole month's listing as JSON
json_all_url = 'http://jobsky.csu.edu.cn/Home/SearchDateAllMonth'
json_data = requests.post(json_all_url, data={'Date': '2018-09-04'}).json()

basic_html_url = 'http://jobsky.csu.edu.cn/Home/ArticleDetails/'
counter_all = 1
for data in json_data:
    # Each listing links to a detail page keyed by NewsID
    html_url = basic_html_url + str(data['NewsID'])
    html_txt = requests.get(html_url)
    bs = BeautifulSoup(html_txt.text, 'lxml')

    # Company name
    try:
        name_tag = bs.find('h1', attrs={'class': 'text-center title'})
        sheet1.write(counter_all, 2, name_tag.getText())
    except Exception:
        logging.debug('the url ' + html_url + ' has some problem')

    # Time and place
    try:
        tp = bs.find('div', attrs={'id': 'placeAndTime'})
        sheet1.write(counter_all, 0, tp.find('p', attrs={'class': 'text-center time'}).getText())
        sheet1.write(counter_all, 1, tp.find('p', attrs={'class': 'text-center place'}).getText())
    except Exception:
        logging.debug('the url ' + html_url + ' has some problem')

    # The requirements table: judging from the page source, the cells we want
    # are the 2nd, 4th, 6th and 8th <td> (position, education, majors, openings)
    try:
        demand_table = bs.find('table', attrs={'class': 'table table-bordered'})
        for idx, td in enumerate(demand_table.find_all('td')):
            if idx == 1:
                sheet1.write(counter_all, 3, td.getText())
            elif idx == 3:
                sheet1.write(counter_all, 4, td.getText())
            elif idx == 5:
                sheet1.write(counter_all, 5, td.getText())
            elif idx == 7:
                sheet1.write(counter_all, 6, td.getText())
    except Exception:
        logging.debug('the url ' + html_url + ' has some problem')

    sheet1.write(counter_all, 7, html_url)
    counter_all += 1  # always advance the row, even if some cells failed to parse

# xlwt can only write the legacy .xls format
workbook.save('中南大学招聘信息.xls')
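One thing worth adding (it isn't in the script above): this crawler fires one request per company detail page with no timeout or delay. A tiny helper with a timeout, a couple of retries, and a pause between attempts would be kinder to both the server and a flaky connection; the numbers here are arbitrary:

import time

import requests

def polite_get(url, retries=3, delay=1.0, timeout=10):
    """GET with a timeout and simple retries, pausing between attempts."""
    last_err = None
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()  # treat 4xx/5xx responses as failures too
            return resp
        except requests.RequestException as err:
            last_err = err
            time.sleep(delay * (attempt + 1))  # back off a little more each retry
    raise last_err

# usage: html_txt = polite_get(html_url) instead of requests.get(html_url)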
Last is Hunan University. For some reason its postings are pitifully few.

#!/usr/bin/python3
# coding=utf-8
import logging

import requests
import xlwt

# This endpoint returns the first 15 career-talk entries as JSON
json_url = 'https://hnu.bysjy.com.cn/module/getcareers?start_page=1&keyword=&type=inner&day=&count=15&start=1&_=1536044186160'
logging.basicConfig(level=logging.DEBUG, format='%(message)s')
json_data = requests.get(json_url)

# Spreadsheet and header row (time, place, company, career fair, majors wanted, details link)
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
sheet1.write(0, 0, '时间')
sheet1.write(0, 1, '地点')
sheet1.write(0, 2, '公司名称')
sheet1.write(0, 3, '招聘会')
sheet1.write(0, 4, '专业要求')
sheet1.write(0, 5, '详细信息')  # header moved to column 5 to match the data below

count = 1
data_list = json_data.json()['data']  # the useful part of the response
for item in data_list:
    sheet1.write(count, 0, item['meet_day'] + item['meet_time'])
    sheet1.write(count, 1, item['address'])
    sheet1.write(count, 2, item['company_name'])
    sheet1.write(count, 3, item['meet_name'])
    sheet1.write(count, 4, item['professionals'])
    sheet1.write(count, 5, 'https://hnu.bysjy.com.cn/detail/career?id=' + item['career_talk_id'])
    count += 1

# xlwt can only write the legacy .xls format
workbook.save('湖南大学招聘信息.xls')
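Actually, looking at the URL again: it only asks for the first 15 entries (count=15, start_page=1), so the thin results may be my own fault. If the endpoint pages the way its parameters suggest (which I haven't verified; the start/start_page semantics are a guess), fetching everything might look like:

import requests

base = 'https://hnu.bysjy.com.cn/module/getcareers'
all_records = []
for page in range(1, 51):  # safety cap so a misread API can't loop forever
    params = {
        'start_page': page,            # guessed: 1-based page index
        'keyword': '',
        'type': 'inner',
        'day': '',
        'count': 15,
        'start': (page - 1) * 15 + 1,  # guessed: 1-based record offset
    }
    data = requests.get(base, params=params).json().get('data', [])
    if not data:
        break  # no more pages
    all_records.extend(data)

print(len(all_records), 'records fetched')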