python3+beautifulSoup4.6抓取某網站小說（三）網頁分析，BeautifulSoup解析

本文轉載自查看原文 2018-04-08 15:41 1566 python3/ beautifulSoup4.6

本章學習內容：將網站上的小說都爬下來，存儲到本地。

目標網站：www.cuiweijuxs.com

分析頁面，發現一共4步：從主頁進入分版打開分頁列表、打開分頁下所有鏈接、打開作品頁面、打開單章內容。

所以實現步驟如下：

1、進入分版頁面，www.cuiweijuxs.com/jingpinxiaoshuo/

找到最大分頁數

<a href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_122.html" class="last">122</a>

循環打開每個頁面

href="http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"

2、找到當頁所有鏈接，循環打開單頁鏈接，下為可定位元素

div id="newscontent"
 div class="l"
　　<span class="s2">
 　　<a href="http://www.cuiweijuxs.com/4_4521/" target="_blank">標題</a>

3、打開單頁鏈接，找到章節列表，下為可定位元素

<div id="list">
<dd>
<a href="/4_4508/528170.html">第一章</a>
</dd>
</div>

4、打開單章鏈接，讀取內容

<div id="content">

內容

</div>

step1：創建class，初始化參數，並封裝一個方法來獲取BeautifulSoup解析後的網頁

# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import os

'''
使用BeautifulSoup抓取網頁
'''

class Capture():
    """Crawler configuration plus the shared page-fetching helper.

    Holds the site URLs, the output folder and the HTTP headers used for
    every request, and exposes ``getSoup`` to download and parse a page.
    """

    def __init__(self):
        # Site root; relative chapter links are resolved against it.
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        # Category index page, used to discover the number of listing pages.
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        # Listing-page template; "?" is replaced by the 1-based page number.
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        # Root folder for downloaded novels.
        self.folder_path = '小說/'
        # Seconds to wait on a stalled connection before giving up.
        self.timeout = 30
        # Send a browser-like User-Agent with every request.
        self.head = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19',
        }

    def getSoup(self, query_url):
        """Download ``query_url`` and return it parsed as a BeautifulSoup tree.

        Raises whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``
        or a timeout) when the page cannot be fetched.
        """
        req = request.Request(query_url, headers=self.head)
        # The context manager closes the HTTP response even if read() fails;
        # the timeout keeps one dead connection from hanging the whole crawl.
        with request.urlopen(req, timeout=self.timeout) as webpage:
            html = webpage.read()
        # html5lib is the lenient parser: it repairs malformed markup the way
        # a browser would.
        return BeautifulSoup(html, 'html5lib')
        # end getSoup

step2：進入分版頁面，找到最大分頁數，並循環打開每個頁面

# 讀取更新列表
    def readPageOne(self):
        """Read the category index, then visit every listing page in order.

        The number of listing pages is taken from the text of the
        ``<a class="last">`` pagination link; each page URL is produced by
        substituting the 1-based page number for "?" in ``two_page_url``.
        """
        index_soup = self.getSoup(self.one_page_url)
        last_link = index_soup.find("a", "last")
        page_count = int(last_link.string)
        url_template = str(self.two_page_url)

        position = 0
        while position < page_count:
            # Progress output is the 0-based loop position; the URL uses the
            # 1-based page number.
            print(position)
            self.readPageTwo(url_template.replace("?", str(position + 1)))
            position += 1

    # end readPageOne

　　使用getSoup方法獲取解析後的html網頁，使用find方法找到class是“last”的a標簽，獲取最大分頁數

　　循環分頁，從1開始

step3：讀取單頁鏈接

#讀取單頁鏈接
def readPageTwo(self,page_url):
    """Open one listing page and crawl every novel linked from it.

    For each novel link found under ``div#newscontent > div.l`` inside a
    ``<span class="s2">``, a folder named after the link text is created
    below ``self.folder_path`` and the novel page is handed to
    ``readPageThree``.
    """
    soup = self.getSoup(page_url)
    con_div = soup.find('div',{'id':'newscontent'}).find('div',{'class':'l'})
    # Collect the links from every matching span. The previous
    # find_all(...)[0] looked at the first span only, so at most one novel
    # per listing page was ever downloaded.
    a_list = [a
              for span in con_div.find_all('span',{'class':'s2'})
              for a in span.find_all('a')]
    print(a_list)
    for a_href in a_list:
        href = a_href.get('href')
        folder_name = a_href.get_text()
        print('a_href',href,'---folder_name',folder_name)
        path = self.folder_path + folder_name
        self.createFolder(path)
        self.readPageThree(href,path)
        # end for

# end readPageTwo

　　找到id是newscontent的div，再往下找到class是“l”的div，再找到所有class是“s2”的span，找到這些span下的a標簽，循環打開a標簽

並找到標簽名（ a_href.get_text() ）作為文件夾名稱

step4：打開作品頁面，循環章節鏈接，拼接文件名稱

   #打開作品頁面
    def readPageThree(self,page_url,path):
        """Open a novel page and download every chapter that is not on disk yet.

        ``page_url`` is the novel's chapter-list page and ``path`` the folder
        the chapters are written to. Each chapter is saved as
        ``<path>/<n>_<chapter title>.txt`` where ``n`` is the 1-based position
        of the link inside ``div#list``.
        """
        # Local import so this method does not depend on the file's import block.
        from urllib.parse import urljoin

        soup = self.getSoup(page_url)
        print('readPageThree--',page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        for idx, a_href in enumerate(a_list, start=1):
            link = a_href.get('href')
            if not link:
                # An anchor without href has nothing to download.
                continue
            # Chapter links are site-relative ("/4_4508/528170.html") and the
            # site root already ends in "/", so plain concatenation produced
            # "...com//4_4508/..."; urljoin resolves the link correctly.
            href = urljoin(self.index_page_url, link)
            txt_name = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('a_href', href, '---path', txt_name)
            # Checking here (before the request) avoids re-downloading
            # chapters saved by an earlier run.
            if os.path.exists(txt_name):
                print(txt_name, '已存在')
            else:
                self.readPageFour(href,txt_name)

step5：打開章節鏈接，讀取id=content的div下所有內容，寫入文件中

 #讀取單章內容並寫入
    def readPageFour(self,page_url,path):
        """Download one chapter page and write its text to the file ``path``.

        The chapter text is everything inside ``<div id="content">``.
        """
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        # get_text() returns text with all tags already removed, so replacing
        # '<br/>' afterwards never matched and the line breaks were lost.
        # Passing a separator puts a newline between the text fragments
        # instead. Non-breaking spaces reach us as U+00A0 and are turned
        # into ordinary spaces.
        content = con_div.get_text('\n').replace('\xa0', ' ')
        self.writeTxt(path,content)

完整代碼實現如下：

  1 # -*- coding: UTF-8 -*-
  2 from urllib import request
  3 from bs4 import BeautifulSoup
  4 import os
  5 
  6 '''
  7 使用BeautifulSoup抓取網頁
  8 '''
  9 
 10 class Capture():
 11 
 12     def __init__(self):
 13         self.index_page_url = 'http://www.cuiweijuxs.com/'
 14         self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
 15         self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
 16         self.folder_path = '小說/'
 17         self.head = {}
 18         # 寫入User Agent信息
 19         self.head[
 20             'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
 21 
 22     # 獲取BeautifulSoup
 23     def getSoup(self, query_url):
 24         req = request.Request(query_url, headers=self.head)
 25         webpage = request.urlopen(req)
 26         html = webpage.read()
 27         #soup = BeautifulSoup(html, 'html.parser')
 28         soup = BeautifulSoup(html, 'html5lib')
 29         return soup
 30         # end getSoup
 31 
 32     #讀取更新列表
 33     def readPageOne(self):
 34         soup = self.getSoup(self.one_page_url)
 35         last = soup.find("a","last")
 36         itemSize = int(last.string)
 37         page_url = str(self.two_page_url)
 38 
 39         for item in range(itemSize):
 40             print( item )
 41             new_page_url = page_url.replace( "?",str(item+1) )
 42             self.readPageTwo(new_page_url)
 43 
 44         # end readPageOne
 45 
 46     #讀取單頁鏈接
 47     def readPageTwo(self,page_url):
 48         soup = self.getSoup(page_url)
 49         con_div = soup.find('div',{'id':'newscontent'}).find('div',{'class':'l'})
 50         a_list = con_div.find_all('span',{'class':'s2'})[0].find_all('a')
 51         print(a_list)
 52         for a_href in a_list:
 53             #print(child)
 54             href = a_href.get('href')
 55             folder_name = a_href.get_text()
 56             print('a_href',href,'---folder_name',folder_name)
 57             path = self.folder_path + folder_name
 58             self.createFolder(path)
 59             self.readPageThree(href,path)
 60             # end for
 61 
 62         # end readPage
 63 
 64     #打開單章鏈接
 65     def readPageThree(self,page_url,path):
 66         soup = self.getSoup(page_url)
 67         print('readPageThree--',page_url)
 68         a_list = soup.find('div', {'id': 'list'}).find_all('a')
 69         idx = 0
 70         for a_href in a_list:
 71             idx = idx+1
 72             href = self.index_page_url +  a_href.get('href')
 73             txt_name =   path + '/' +  str(idx) + '_'+ a_href.get_text()  + '.txt'
 74             print('a_href', href, '---path', txt_name)
 75             isExists = os.path.exists(txt_name)
 76             if isExists:
 77                 print(txt_name, '已存在')
 78             else:
 79                 self.readPageFour(href,txt_name)
 80 
 81 
 82     #讀取單章內容並寫入
 83     def readPageFour(self,page_url,path):
 84         soup = self.getSoup(page_url)
 85         con_div = soup.find('div', {'id': 'content'})
 86         content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
 87         self.writeTxt(path,content)
 88 
 89     def readPageHtml(self,page_url,path):
 90         soup = self.getSoup(page_url)
 91         con_div = soup.find('div', {'id': 'content'})
 92         content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
 93 
 94 
 95     def createFolder(self,path):
 96         path = path.strip()
 97         # 去除尾部 \ 符號
 98         path = path.rstrip("\\")
 99         isExists = os.path.exists(path)
100         # 不存在則創建
101         if not isExists:
102             os.makedirs(path)
103             print(path + ' create')
104         else:
105             print( path + ' 目錄已存在')
106         #end createFolder
107 
108     def writeTxt(self,file_name,content):
109         isExists = os.path.exists(file_name)
110         if isExists:
111             print(file_name,'已存在')
112         else:
113             file_object = open(file_name, 'w',encoding='utf-8')
114             file_object.write(content)
115             file_object.close()
116 
117     def run(self):
118         try:
119             self.readPageOne()
120         except BaseException as error:
121             print('error--',error)
122 
123 
124 Capture().run()

View Code

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 抓取分析網頁批量下載評書(1)之搜索有聲小說 python 抓取小說網站，制作電子書。 Python【BeautifulSoup解析和提取網頁數據】 python爬蟲學習基礎之網頁解析(2)BeautifulSoup python爬蟲抓取小說--練習 python3用BeautifulSoup抓取a標簽抓取一個網站全部的網頁URL--Python、爬蟲使用python抓取並分析數據—鏈家網(requests+BeautifulSoup)（轉） python3用BeautifulSoup抓取div標簽爬蟲基礎：BeautifulSoup網頁解析庫