The code is as follows:
from DrawStu.DrawStu import DrawStu
import time
import io
import sys

# Re-wrap stdout so Chinese text prints correctly on a GBK console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

# Initialize the class and get a crawler instance
draw = DrawStu()

if __name__ == '__main__':
    print('Crawling graduate adjustment information')
    size = draw.get_page_size()
    print(size)
    for x in range(size):
        start = x * 50
        print(start)
        created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(start)
        draw.draw_base_list(created_url)
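The entry script imports time but never calls it; a delay between page requests was presumably intended. A minimal sketch of the same loop with error handling and a one-second politeness delay added (neither is in the original):

import time
from DrawStu.DrawStu import DrawStu

draw = DrawStu()
size = draw.get_page_size()
for x in range(size):
    # The list view pages in steps of 50, matching the ?start= offsets above
    created_url = 'https://yz.chsi.com.cn/kyzx/tjxx/?start=' + str(x * 50)
    try:
        draw.draw_base_list(created_url)
    except Exception as exc:
        # Skip a broken page instead of aborting the whole crawl
        print('failed to crawl', created_url, exc)
    time.sleep(1)  # assumed politeness delay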
import sqlite3


class DB(object):
    """Implementation of the database access methods."""

    def __init__(self):
        # Initialize the database API: a connection object and a cursor for statements
        self.conn = sqlite3.connect(r'Test.db')
        self.cus = self.conn.cursor()

    def create_table(self):
        sql = ('CREATE TABLE if not exists mynews '
               '(CrawlTime char, Title char, Content char, PublishTime char, Origin char)')
        self.conn.execute(sql)
        self.conn.commit()
        print('create table successfully')

    def insert_into_news(self, ops):
        self.conn.execute(
            'insert into mynews(CrawlTime, Title, Content, PublishTime, Origin) '
            'values(?, ?, ?, ?, ?)',
            (ops['CrawlTime'], ops['Title'], ops['Content'],
             ops['PublishTime'], ops['Origin']))
        self.conn.commit()
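The class can be sanity-checked on its own before wiring it into the crawler; a minimal sketch, with the row values made up for illustration:

from DB.DB import DB

db = DB()
db.create_table()
# Insert one made-up row, then read everything back through the cursor
db.insert_into_news({
    'CrawlTime': '2019-01-01 12:00:00',
    'Title': 'sample title',
    'Content': 'sample content',
    'PublishTime': '2019-01-01',
    'Origin': 'sample origin',
})
for row in db.cus.execute('select * from mynews'):
    print(row)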
# The assignment asks for urllib3; the standard-library urllib.request is used here
import urllib.request
import time
from bs4 import BeautifulSoup
from DB.DB import DB

db = DB()


class DrawStu:
    """Core crawler module; only responsible for fetching the graduate adjustment information."""

    def __init__(self):
        self.baseurl = 'https://yz.chsi.com.cn/kyzx/tjxx/'
        db.create_table()

    # Shared helper for fetching a page and parsing it
    def commonsdk(self, url):
        # The URL is a parameter, so list and detail pages share this fetch path
        response = urllib.request.urlopen(url)
        html = response.read()  # raw bytes; BeautifulSoup takes care of decoding
        print(html)
        doc = BeautifulSoup(html, 'html.parser')
        return doc

    # Crawl the first-level list
    def draw_base_list(self, url):
        print('url is:::', url)
        doc = self.commonsdk(url)
        lilist = doc.find('ul', {'class': 'news-list'}).findAll('li')
        # First-level fields: title, publish time, and the detail-page link
        for x in lilist:
            Title = x.find('a').text
            Time = x.find('span').text
            Link = 'https://yz.chsi.com.cn' + x.find('a').get('href')
            self.draw_detail_list(Link, Title, Time)

    # Crawl the second-level detail page
    def draw_detail_list(self, url, Title, Time):
        doc = self.commonsdk(url)
        from_info = doc.find('span', {'class': 'news-from'}).text
        content = doc.find('div', {'class': 'content-l detail'}).text
        ctime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        # Assemble the fields into a dict and hand it to the database API
        data = {
            'CrawlTime': ctime,
            'Title': Title,
            'Content': content,
            'PublishTime': Time,
            'Origin': from_info
        }
        print(data)
        print('Inserting into the database')
        db.insert_into_news(data)

    # Get the total number of list pages
    def get_page_size(self):
        requesturl = self.baseurl
        pcxt = self.commonsdk(requesturl).find('div', {'class': 'pageC'}).findAll('span')[0].text
        print(pcxt)
        # Plain string handling; a regex would be more robust (see the sketch below)
        pagesize = pcxt.strip()
        pagearr = pagesize.split('/')
        pagestr = pagearr[1]
        return int(pagestr[0:2])
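The original comment mentions regular expressions, yet int(pagestr[0:2]) only works when the total page count has exactly two digits. A more robust sketch, assuming the pager text has the '1/60' shape implied by the split above:

import re

def parse_page_size(pager_text):
    # Pager text like ' 1/60 ': take the digits after the slash
    match = re.search(r'/\s*(\d+)', pager_text)
    return int(match.group(1)) if match else 0

print(parse_page_size(' 1/60 '))   # -> 60
print(parse_page_size(' 1/605 '))  # -> 605, where the two-digit slice would return 60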
Use F12 in the browser to inspect the page elements.
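The selectors in draw_base_list assume list markup roughly like the fragment below; it is a hand-written stand-in for what the F12 inspector shows, not copied from the live page:

from bs4 import BeautifulSoup

# Hand-written stand-in for the markup seen in the inspector
html = '''
<ul class="news-list">
  <li><a href="/kyzx/tjxx/201901/sample.shtml">Sample adjustment notice</a>
      <span>2019-01-01</span></li>
</ul>
'''
doc = BeautifulSoup(html, 'html.parser')
for li in doc.find('ul', {'class': 'news-list'}).findAll('li'):
    print(li.find('a').text.strip(),
          li.find('span').text,
          'https://yz.chsi.com.cn' + li.find('a').get('href'))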
Crawl results:
The data is converted into database-table form using the database net tool; the result is as follows:
Create a new query and enter: select * from mynews
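The same query can also be run straight from Python, against the Test.db file the crawler writes:

import sqlite3

conn = sqlite3.connect('Test.db')
for row in conn.execute('select * from mynews'):
    print(row)
conn.close()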
The recorded information for every school can be queried this way.