python爬取指定新聞

本文轉載自查看原文 2019-04-03 20:39 1240

作業的要求來自於：https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/2894

給定一篇新聞的鏈接newsUrl，獲取該新聞的全部信息

　　標題、作者、發布單位、審核、來源

　　發布時間:轉換成datetime類型

　　點擊：

newsUrl
newsId(使用正則表達式re)
clickUrl(str.format(newsId))
requests.get(clickUrl)
newClick(用字符串處理，或正則表達式)
int()

整個過程包裝成一個簡單清晰的函數。

#coding = utf-8;
import re;
import requests;
from datetime import datetime;
from bs4 import BeautifulSoup;

class News(object):
    """A single GZCC campus-news article, scraped from its web page.

    The page is downloaded and parsed once, in ``__init__``.  The text of the
    first ``.show-info`` element is split on whitespace and the individual
    fields are sliced out of the resulting tokens by position:

    * tokens 0 and 1 -> update date and time (5-character label stripped
      from token 0)
    * tokens 2, 3, 4 -> author, auditor, origin (3-character label stripped
      from each)

    NOTE(review): the label widths and token order are taken from the
    slicing below; the actual page layout is not visible here -- confirm
    against a live page.
    """

    # Seconds to wait for the server before giving up; without a timeout
    # ``requests.get`` can block forever on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        """Download and parse the article at ``url``.

        Raises:
            IndexError: if the page has no ``.show-info`` element or that
                element has fewer than two whitespace-separated tokens.
        """
        self.url = url  # address of the news page
        self._dom_tree = self._tranfrom_dom_tree(url)

        self._show_infos = self._dom_tree.select(".show-info")[0].text.split()
        # Last update time as "<date> <time>" text.
        self._update_time = self._show_infos[0][5:] + " " + self._show_infos[1]

    def _tranfrom_dom_tree(self, url):
        """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Force UTF-8 decoding instead of relying on requests' guess.
        response.encoding = "utf-8"
        return BeautifulSoup(response.text, "html.parser")

    @property
    def title(self):
        """News title: text of the first ``.show-title`` element."""
        return self._dom_tree.select(".show-title")[0].text

    @property
    def author(self):
        """News author (third info token, label stripped)."""
        return self._show_infos[2][3:]

    # Original, misspelled name of ``author``; kept so existing callers work.
    auothor = author

    @property
    def auditor(self):
        """News auditor (fourth info token, label stripped)."""
        return self._show_infos[3][3:]

    @property
    def origin(self):
        """Publishing unit of the news (fifth info token, label stripped)."""
        # The original evaluated this expression without returning it, so the
        # property was always None.
        return self._show_infos[4][3:]

    @property
    def update_time(self):
        """Last update time of the news, as ``"<date> <time>"`` text."""
        return self._update_time

    @update_time.setter
    def update_time(self, time):
        self._update_time = time

    @property
    def times(self):
        """Click count of the news, as an ``int``.

        Queries the counter API with this article's ``news_id`` and takes the
        last run of digits in the response body as the count.

        Raises:
            IndexError: if the response contains no digits at all.
        """
        clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(self.news_id)
        res = requests.get(clickUrl, timeout=self.REQUEST_TIMEOUT)
        return int(re.findall(r'(\d+)', res.text)[-1])

    @property
    def news_id(self):
        """Numeric identifier of the news, as a string, taken from ``self.url``.

        The identifier is the file name (without ``.html``) of a URL shaped
        like ``http://news.gzcc.cn/html/<year>/<section>/<id>.html``.

        Raises:
            ValueError: if ``self.url`` does not have that shape.
        """
        matched = re.match(r'http://news\.gzcc\.cn/html/\d{4}/.*/(\d+)\.html', self.url)
        if matched is None:
            raise ValueError("cannot extract news id from url: {!r}".format(self.url))
        return matched.group(1)

if __name__ == "__main__":
    # Sample article used to exercise the scraper when run as a script.
    html_url = "http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0322/11047.html"
    # Constructing the object downloads and parses the page immediately.
    news = News(html_url)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 python爬蟲使用Xpath爬取指定位置的內容 Python爬取網站新聞 python 圖片爬蟲抓取圖片系列一——爬取指定網頁中的圖片 python爬搜狗微信獲取指定微信公眾號的文章 如何利用python爬取網易新聞 圖片爬蟲工具，可以爬取指定網頁的圖片 python selenium截取指定元素圖片 python 獲取指定字符前面或後面的所有字符 python獲取指定日期和轉換的整理 python抽取指定url頁面的title方法