The code is as follows:
from DrawStu.DrawStu import DrawStu
import time
import io
import sys

# Re-wrap stdout so Chinese text prints correctly on a GBK console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Initialize the class and get a crawler instance
draw = DrawStu()

if __name__ == '__main__':
    print('Crawling graduate adjustment information')
    size = draw.get_page_size()
    print(size)
    for x in range(size):
        start = x * 50
        print(start)
        created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(start)
        draw.draw_base_list(created_url)
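The entry script imports time but never calls it; a delay between page requests was presumably intended. A minimal sketch of the same loop with error handling and a one-second politeness delay added (neither is in the original):

import time
from DrawStu.DrawStu import DrawStu

draw = DrawStu()
size = draw.get_page_size()
for x in range(size):
    # The list view pages in steps of 50, matching the ?start= offsets above
    created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(x * 50)
    try:
        draw.draw_base_list(created_url)
    except Exception as exc:
        # Skip a broken page instead of aborting the whole crawl
        print('failed to crawl', created_url, exc)
    time.sleep(1)  # assumed politeness delay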
import sqlite3


class DB(object):
    """Implementation of the database access methods."""

    def __init__(self):
        # Initialize the database API: a connection object and a cursor for statements
        self.conn = sqlite3.connect(r'Test.db')
        self.cus = self.conn.cursor()

    def create_table(self):
        sql = ('CREATE TABLE if not exists mynews '
               '(CrawlTime char, Title char, Content char, PublishTime char, Origin char)')
        self.conn.execute(sql)
        self.conn.commit()
        print('create table successfully')

    def insert_into_news(self, ops):
        self.conn.execute(
            'insert into mynews(CrawlTime, Title, Content, PublishTime, Origin) '
            'values(?, ?, ?, ?, ?)',
            (ops['CrawlTime'], ops['Title'], ops['Content'],
             ops['PublishTime'], ops['Origin']))
        self.conn.commit()
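The class can be sanity-checked on its own before wiring it into the crawler; a minimal sketch, with the row values made up for illustration:

from DB.DB import DB

db = DB()
db.create_table()
# Insert one made-up row, then read everything back through the cursor
db.insert_into_news({
    'CrawlTime': '2019-01-01 12:00:00',
    'Title': 'sample title',
    'Content': 'sample content',
    'PublishTime': '2019-01-01',
    'Origin': 'sample origin',
})
for row in db.cus.execute('select * from mynews'):
    print(row)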
# The assignment asks for urllib3; the standard-library urllib.request is used here
import urllib.request
import time
from bs4 import BeautifulSoup
from DB.DB import DB

db = DB()


class DrawStu:
    """Core crawler module; only responsible for fetching the graduate adjustment information."""

    def __init__(self):
        self.baseurl = 'https://yz.chsi.com.cn/kyzx/tjxx/'
        db.create_table()

    # Shared helper for fetching a page and parsing it
    def commonsdk(self, url):
        # The URL is a parameter, so list and detail pages share this fetch path
        response = urllib.request.urlopen(url)
        html = response.read()  # raw bytes; BeautifulSoup takes care of decoding
        print(html)
        doc = BeautifulSoup(html, 'html.parser')
        return doc

    # Crawl the first-level list
    def draw_base_list(self, url):
        print('url is:::', url)
        doc = self.commonsdk(url)
        lilist = doc.find('ul', {'class': 'news-list'}).findAll('li')
        # First-level fields: title, publish time, and the detail-page link
        for x in lilist:
            Title = x.find('a').text
            Time = x.find('span').text
            Link = 'https://yz.chsi.com.cn' + x.find('a').get('href')
            self.draw_detail_list(Link, Title, Time)

    # Crawl the second-level detail page
    def draw_detail_list(self, url, Title, Time):
        doc = self.commonsdk(url)
        from_info = doc.find('span', {'class': 'news-from'}).text
        content = doc.find('div', {'class': 'content-l detail'}).text
        ctime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        # Assemble the fields into a dict and hand it to the database API
        data = {
            'CrawlTime': ctime,
            'Title': Title,
            'Content': content,
            'PublishTime': Time,
            'Origin': from_info
        }
        print(data)
        print('Inserting into the database')
        db.insert_into_news(data)

    # Get the total number of list pages
    def get_page_size(self):
        requesturl = self.baseurl
        pcxt = self.commonsdk(requesturl).find('div', {'class': 'pageC'}).findAll('span')[0].text
        print(pcxt)
        # Plain string handling; a regex would be more robust (see the sketch below)
        pagesize = pcxt.strip()
        pagearr = pagesize.split('/')
        pagestr = pagearr[1]
        return int(pagestr[0:2])
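The original comment mentions regular expressions, yet int(pagestr[0:2]) only works when the total page count has exactly two digits. A more robust sketch, assuming the pager text has the '1/60' shape implied by the split above:

import re

def parse_page_size(pager_text):
    # Pager text like ' 1/60 ': take the digits after the slash
    match = re.search(r'/\s*(\d+)', pager_text)
    return int(match.group(1)) if match else 0

print(parse_page_size(' 1/60 '))   # -> 60
print(parse_page_size(' 1/605 '))  # -> 605, where the two-digit slice would return 60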
Use F12 in the browser to inspect the page elements.
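The selectors in draw_base_list assume list markup roughly like the fragment below; it is a hand-written stand-in for what the F12 inspector shows, not copied from the live page:

from bs4 import BeautifulSoup

# Hand-written stand-in for the markup seen in the inspector
html = '''
<ul class="news-list">
  <li><a href="/kyzx/tjxx/201901/sample.shtml">Sample adjustment notice</a>
      <span>2019-01-01</span></li>
</ul>
'''
doc = BeautifulSoup(html, 'html.parser')
for li in doc.find('ul', {'class': 'news-list'}).findAll('li'):
    print(li.find('a').text.strip(),
          li.find('span').text,
          'https://yz.chsi.com.cn' + li.find('a').get('href'))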
Crawl results:
The data is converted into database-table form using the database net tool; the result is as follows:
Create a new query and enter: select * from mynews
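The same query can also be run straight from Python, against the Test.db file the crawler writes:

import sqlite3

conn = sqlite3.connect('Test.db')
for row in conn.execute('select * from mynews'):
    print(row)
conn.close()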
The recorded information for every school can be queried this way.