因為馬上就要大四實習了,博主實在懶得在學校官網上一個個翻,直接用爬蟲將所有數據都爬下來
放在表格里,這樣感覺簡單多了,可惜還沒找到工作,so sad
總共選擇了三個學校:湖南大學,中南大學,湘潭大學
三個項目代碼分別如下(新手代碼,慘不忍睹):
湘潭大學:
#!/usr/bin/python3
# coding=utf-8
"""Scrape Xiangtan University's October-2018 daily career-talk listings.

For every day of October 2018, fetch the JSON listing from the campus jobs
site and append one spreadsheet row per career talk (time, place, company,
majors wanted, detail URL).
"""
import requests
import json
import logging
import os
import xlwt

basic_url = 'http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-'
logging.basicConfig(level=logging.DEBUG, format='')

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
# Header row. Column 4 is intentionally left blank, matching the data layout below.
sheet1.write(0, 0, '時間')
sheet1.write(0, 1, '地點')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '專業要求')
sheet1.write(0, 5, '詳細信息')

count = 1  # next spreadsheet row to fill (row 0 is the header)
for day in range(1, 32):  # October has 31 days
    # NOTE(review): the day is not zero-padded ('2018-10-1'); the endpoint
    # apparently accepts this — confirm before changing.
    url = basic_url + str(day)
    logging.debug('the clawer web site is:' + url)
    clawertext = requests.get(url)
    logging.debug(type(clawertext))
    logging.debug(clawertext.json())
    logging.debug(clawertext.json()['data'])
    logging.debug(type(clawertext.json()['data']))
    data_list = clawertext.json()['data']  # the useful data
    # BUG FIX: the inner loop previously reused the outer loop variable `i`,
    # shadowing the day index — confusing and error-prone.
    for talk in data_list:
        sheet1.write(count, 0, talk['meet_day'])
        sheet1.write(count, 1, talk['address'])
        sheet1.write(count, 2, talk['meet_name'])
        sheet1.write(count, 3, talk['professionals'])
        sheet1.write(count, 5,
                     'http://jobs.xtu.edu.cn/detail/career?id=' + talk['career_talk_id'])
        count = count + 1

# BUG FIX: xlwt produces the legacy BIFF (.xls) format; saving it with an
# .xlsx extension yields a file Excel warns about or refuses to open.
workbook.save('湘潭大學十月份招聘信息.xls')
中南大學:
這個最坑,花了我一個多小時
#!/usr/bin/python3
# coding=utf-8
"""Scrape Central South University career-talk announcements into a spreadsheet.

POST to the month-listing endpoint to get every announcement, then fetch and
parse each detail page with BeautifulSoup for the company name, time/place,
and the job-demand table.
"""
import requests
import xlwt
import json
import logging
import bs4
from bs4 import BeautifulSoup

# Configure the log file path and message format.
logging.basicConfig(filename='log.txt', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger('requests').setLevel(logging.WARNING)  # silence requests' own logging

# Set up the workbook and the header row.
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list')
sheet1.write(0, 0, '時間')
sheet1.write(0, 1, '地點')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '職位名稱')
sheet1.write(0, 4, '教育水平')
sheet1.write(0, 5, '專業要求')
sheet1.write(0, 6, '空缺數量')
sheet1.write(0, 7, '詳細信息')

# Fetch the full month's announcement list as JSON.
json_all_url = 'http://jobsky.csu.edu.cn/Home/SearchDateAllMonth'
dt1 = {'Date': '2018-09-04'}
post_data = requests.post(json_all_url, data=dt1)
json_data = post_data.json()
logging.debug(type(json_data))

basic_html_url = 'http://jobsky.csu.edu.cn/Home/ArticleDetails/'
counter_all = 1  # next spreadsheet row (row 0 is the header)
for data in json_data:
    company_Id = data['NewsID']
    html_url = basic_html_url + company_Id
    html_txt = requests.get(html_url)
    bs = BeautifulSoup(html_txt.text, 'lxml')

    # Company name comes from the page's <h1> title.
    list_soup_CN = bs.find('h1', attrs={'class': 'text-center title'})
    try:
        advertise_company_name = list_soup_CN.getText()
        sheet1.write(counter_all, 2, advertise_company_name)
    except Exception:  # BUG FIX: was a bare `except:`, which also swallows SystemExit
        logging.debug("the url" + html_url + 'has some problem')

    # Time and place of the career talk.
    try:
        list_soup_TP = bs.find('div', attrs={'id': 'placeAndTime'})
        advertise_time = list_soup_TP.find('p', attrs={'class': 'text-center time'}).getText()
        advertise_place = list_soup_TP.find('p', attrs={'class': 'text-center place'}).getText()
        sheet1.write(counter_all, 0, advertise_time)
        sheet1.write(counter_all, 1, advertise_place)
    except Exception:
        logging.debug("the url" + html_url + 'has some problem')

    # Job-demand table: the <td> cells at odd indices 1/3/5/7 hold the
    # position, education level, majors wanted, and vacancy count
    # (determined by inspecting the page source).
    try:
        list_soup_demand = bs.find('table', attrs={'class': 'table table-bordered'})
        list_td = list_soup_demand.find_all('td')
        for cell_index, td in enumerate(list_td):
            if cell_index == 1:
                sheet1.write(counter_all, 3, td.getText())
            if cell_index == 3:
                sheet1.write(counter_all, 4, td.getText())
            if cell_index == 5:
                sheet1.write(counter_all, 5, td.getText())
            if cell_index == 7:
                sheet1.write(counter_all, 6, td.getText())
    except Exception:
        logging.debug("the url" + html_url + 'has some problem')

    # BUG FIX: the detail-link write and the row counter increment used to
    # live inside the table try-block, so any page missing the demand table
    # caused the NEXT company to overwrite this company's row.
    sheet1.write(counter_all, 7, html_url)
    counter_all += 1

# BUG FIX: xlwt writes the legacy BIFF (.xls) format; the original saved it
# with a misleading .xlsx extension.
workbook.save('中南大學招聘信息.xls')
最后是湖南大學,不知道為什么,湖南大學招聘信息少的可憐
#!/usr/bin/python3
# coding=utf-8
"""Scrape Hunan University's career-talk listing into a spreadsheet.

One GET to the listing endpoint returns all career talks as JSON; each item
becomes a spreadsheet row (time, place, company, talk name, majors wanted,
detail URL).
"""
import requests
import json
import logging
import os
import xlwt

json_url = ('https://hnu.bysjy.com.cn/module/getcareers?start_page=1&keyword='
            '&type=inner&day=&count=15&start=1&_=1536044186160')
logging.basicConfig(level=logging.DEBUG, format='')
json_data = requests.get(json_url)

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
# Header row.
sheet1.write(0, 0, '時間')
sheet1.write(0, 1, '地點')
sheet1.write(0, 2, '公司名稱')
sheet1.write(0, 3, '招聘會')
sheet1.write(0, 4, '專業要求')
# BUG FIX: this header was written to column 6 while the data rows below
# write the detail link to column 5, leaving the header over an empty column.
sheet1.write(0, 5, '詳細信息')

count = 1  # next spreadsheet row (row 0 is the header)
data_list = json_data.json()['data']  # the useful data
for talk in data_list:
    sheet1.write(count, 0, talk['meet_day'] + talk['meet_time'])
    sheet1.write(count, 1, talk['address'])
    sheet1.write(count, 2, talk['company_name'])
    sheet1.write(count, 3, talk['meet_name'])
    sheet1.write(count, 4, talk['professionals'])
    sheet1.write(count, 5,
                 'https://hnu.bysjy.com.cn/detail/career?id=' + talk['career_talk_id'])
    count = count + 1

# BUG FIX: xlwt produces the legacy BIFF (.xls) format, so the saved file
# should not claim an .xlsx extension.
workbook.save('湖南大學招聘信息.xls')
