Python 爬蟲:爬取一系列新聞


這個作業的要求來自於:https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/2941

由於存在多次請求,所以稍微將請求封裝如下

def tranfrom_dom_tree(url, timeout=10):
    '''
        Download a page and parse its HTML text into a BeautifulSoup DOM tree.

        The response body is always decoded as UTF-8, whatever encoding
        requests guessed from the response.

        :param url:      address of the page to download
        :param timeout:  seconds to wait for the server; passed straight to
                         requests.get
        :return:  the BeautifulSoup tree built with the "html.parser" backend
    '''
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=timeout)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")

 

將具體新聞內容封裝如下

class News(object):
    '''
        Data model for a single GZCC campus news article.

        Constructing an instance downloads the article page and extracts the
        publish time, author, auditor and origin from its ".show-info" element.
    '''
    def __init__(self, url):
        '''
            :param url:  address of the article page
        '''
        self._url = url              # address of the article page
        self._dom_tree = tranfrom_dom_tree(url)
        # Whitespace-separated "label:value" tokens of the info bar.
        self._show_infos = self._dom_tree.select(".show-info")[0].text.split()
        self._update_time = ""
        self._auditor = ""
        self._auothor = ""
        self._origin = ""
        # Initialised here so that `summary` / `to_dict` work even when the
        # caller never assigned a summary.
        self._summary = ""
        self._parse_show_infos()

    def _parse_show_infos(self):
        '''
            Fill the metadata fields from the tokens in self._show_infos.
        '''
        # NOTE(review): the labels are matched literally; confirm they use the
        # same script (Traditional vs. Simplified) as the pages being crawled.
        tokens = self._show_infos
        for index, token in enumerate(tokens):
            if token.startswith("發布時間"):
                # The date follows the 5-character label; the clock time is the
                # next whitespace-separated token, when there is one.
                self._update_time = token[5:]
                if index + 1 < len(tokens):
                    self._update_time += " " + tokens[index + 1]
            elif token.startswith("作者"):
                self._auothor = token[3:]
            elif token.startswith("審核"):
                self._auditor = token[3:]
            elif token.startswith("來源"):
                self._origin = token[3:]

    @property
    def title(self):
        '''
            :return:  text of the first ".show-title" element (the headline)
        '''
        return self._dom_tree.select(".show-title")[0].text

    @property
    def auothor(self):
        '''
            :return:  the article author (the misspelt name is kept because
                      it is the public attribute name)
        '''
        return self._auothor

    @property
    def auditor(self):
        '''
            :return:  the article auditor
        '''
        return self._auditor

    @property
    def origin(self):
        '''
            :return:  the unit that published the article
        '''
        return self._origin

    @property
    def update_time(self):
        '''
            :return:  last update time as parsed from the page
        '''
        return self._update_time

    @update_time.setter
    def update_time(self, time):
        '''
            Override the last update time.
            :param time:  the new time value
        '''
        self._update_time = time

    @property
    def times(self):
        '''
            Query the click-count API for this article.
            :return:  the last run of digits in the API response, as a string
        '''
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(self.news_id)
        # A timeout keeps one stalled request from hanging the whole crawl.
        response = requests.get(clickUrl, timeout=10)
        return re.findall(r'(\d+)', response.text)[-1]

    @property
    def news_id(self):
        '''
            :return:  the numeric id taken from the article URL, as a string
            :raises ValueError:  if the URL does not have the expected shape
        '''
        # The id depends only on the URL, so no timestamp parsing is needed;
        # an empty or oddly formatted update time must not break it.
        matched = re.match(r'http://news.gzcc.cn/html/.*/.*/(\d+).html', self._url)
        if matched is None:
            raise ValueError("cannot extract news id from url: %s" % self._url)
        return matched.group(1)

    @property
    def summary(self):
        '''
            :return:  the article summary ("" until one has been assigned)
        '''
        return self._summary

    @summary.setter
    def summary(self, text):
        '''
            Set the article summary.
            :param text:  the summary text
        '''
        self._summary = text

    def to_dict(self):
        '''
            Convert this instance into a plain dictionary.
            Reading "news_times" performs a network request (see `times`).
            :return:  the dictionary form of this article
        '''
        return {
            "news_url": self._url,
            "news_id": self.news_id,
            "news_title": self.title,
            "news_summary": self.summary,
            "news_update_time": self.update_time,
            "news_times": self.times,
            "news_auothor": self.auothor,
            "news_auditor": self.auditor,
            "news_origin": self.origin,
        }

 

  

對新聞進行批操作代碼如下

class GZCCNewsReptile(object):
    '''
        Crawler for the campus news listing of 廣州商學院 (GZCC).

        Constructing an instance downloads the first listing page, which is
        then used to read the total number of articles.
    '''
    def __init__(self):
        self._news_type = "dict"
        self._root_url = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
        self._url = self.page_url()
        self._dom_tree = tranfrom_dom_tree(self._url)

    def page_url(self, page=1):
        '''
            Build the URL of a listing page and remember it as the current page.
            :param page:  1-based page number; page 1 maps to "index.html"
            :return:  the URL of that listing page
        '''
        self._now_page = "index" if page == 1 else page
        return self._root_url + str(self._now_page) + ".html"

    @property
    def count(self):
        '''
            :return:  total number of campus news articles, read from the
                      first ".a1" element with its last character dropped
        '''
        count = self._dom_tree.select(".a1")[0].text
        return int(count[0:-1])

    @property
    def page(self):
        '''
            :return:  total number of listing pages (10 articles per page,
                      rounded up)
        '''
        # Integer ceiling division avoids the float round trip.
        return -(-self.count // 10)

    def get_news_from_news_page_size(self, start_page, end_page):
        '''
            Crawl every listing page in the half-open range
            [start_page, end_page).

            Because end_page is exclusive and may not exceed the page total,
            the very last page cannot be reached through this method; use
            get_page_news for it.

            :param start_page:  first page to crawl
            :param end_page:    page at which to stop (not crawled itself)
            :return:  flat list with the articles of all crawled pages
            :raises IndexError:  if start_page < 1, start_page > end_page, or
                                 end_page is greater than the page total
        '''
        if start_page < 1:
            raise IndexError("start_page不在指定范圍內")
        if start_page > end_page:
            raise IndexError("start_page大於end_page")
        if end_page > self.page:
            raise IndexError("end_page不在指定范圍內")
        news_list = []
        for index in range(start_page, end_page):
            # extend() keeps the result flat without the quadratic cost of
            # sum(list_of_lists, []).
            news_list.extend(self.get_page_news(index))
        return news_list

    def get_page_news(self, page):
        '''
            Crawl one listing page.
            :param page:  1-based page number
            :return:  list with the articles of that page
            :raises IndexError:  if page is outside 1..total page count
        '''
        # Compare against the number of pages, not the number of articles.
        if page < 1 or page > self.page:
            raise IndexError("page不在指定范圍內")
        print("\r當前正在%d頁" % page, end="")
        return self.get_news_from_page_url(self.page_url(page))

    def get_news_from_page_url(self, url):
        '''
            Crawl the articles linked from one listing page.

            At most the first 10 links matching ".news-list li a" are
            followed. An article that fails to download or parse is skipped
            and reported on stdout and in ./logger.txt.

            :param url:  URL of the listing page
            :return:  list of articles converted according to news_type
        '''
        dom_tree = tranfrom_dom_tree(url)
        news_ui = dom_tree.select(".news-list li a")
        # Resolved with getattr instead of eval; the setter only admits "dict".
        converter_name = "to_" + self.news_type
        news_page_list = []
        # Slicing copes with pages that hold fewer than 10 links.
        for a_tag in news_ui[:10]:
            # Read before the try block so the handler always reports the
            # link that actually failed.
            href = a_tag.get("href")
            try:
                news = News(href)
                news.summary = a_tag.select(".news-list-description")[0].text
                news_page_list.append(getattr(news, converter_name)())
            except Exception:
                # Best effort: one broken article must not abort the crawl.
                error_log = "此頁面不正常: %s" % href
                print(error_log, end="")
                with open("./logger.txt", "a", encoding="utf-8") as file:
                    # One entry per line so the log stays readable.
                    file.write(error_log + " 錯誤時間:" + time.strftime("%Y-%m-%d %H:%M:%S") + "\n")

        return news_page_list

    @property
    def news_type(self):
        '''
            :return:  name of the format articles are converted to
        '''
        return self._news_type

    @news_type.setter
    def news_type(self, type="dict"):
        '''
            Choose the format articles are converted to.
            :param type:  the format name; only "dict" is supported
            :raises ValueError:  if the format is not supported
        '''
        if type != "dict":
            # ValueError is an Exception subclass, so existing handlers still work.
            raise ValueError("未能匹配該類型")
        self._news_type = type

測試代碼如下

    #  Smoke test for GZCCNewsReptile: crawl listing pages 1..255 and export them
    start_page = 106
    # news_list = GZCCNewsReptile().get_news_from_news_page_size(start_page,start_page+10);
    news_list = GZCCNewsReptile().get_news_from_news_page_size(1, 256)
    pandas_date = pandas.DataFrame(news_list)
    print(news_list)
    # #csv
    # A forward-slash path avoids the invalid "\p" escape and works on every
    # platform; "utf-8_sig" writes a BOM at the start of the file.
    pandas_date.to_csv("./pandas_date.csv", encoding="utf-8_sig")
    # #sql
    # with sqlite3.connect("test.sqlite") as db:
    #     pandas_date.to_sql("test", db);
    #     date = pandas_date.read_sql_query('SELECT * FROM gzccnewsdb5', con=db)
    #     date[date["news_times"]>380];

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM