爬蟲批量下載全站小說並自動保存

本文轉載自查看原文 2018-05-15 14:21 1121 3-爬蟲爬一個/ 爬蟲

目的是下載一個網站所有分類的小說，並且按照分類自動創建目錄、根據小說名保存為txt文件。

一、抓取思路：

　　我的思路是先用百度找一個小說網站，打開其中一本小說的章節頁，使用 requests、BeautifulSoup 測試是否能正常下載。如果能正常下載，再進行下一步。

二、操作順序：

1.導入模塊，指定網頁請求頭：

from bs4 import BeautifulSoup
import requests
import time
import os
import random

# Pool of browser User-Agent strings to choose from.
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    # The comma after the Firefox entry used to be missing, which silently
    # glued it to the next string and left one malformed User-Agent.
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
]

# Headers sent with every request. The User-Agent is chosen at random once,
# when this statement runs, to mimic a browser.
headers = {
    'User-Agent': random.choice(my_headers),
}

　 2.獲取一本小說的章節頁，並寫入指定路徑：

# Example chapter page used to check that a single chapter can be downloaded.
url = 'http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml'
data = requests.get(url, headers=headers)
time.sleep(2)
# Note the .encode('ISO-8859-1').decode('GB18030') round trip: the text that
# requests decoded as ISO-8859-1 is turned back into bytes and decoded again
# as GB18030 (presumably the page's real encoding - confirm against the site).
soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('GB18030'), 'lxml')

text = soup.select('div.zhangjieTXT')[0].text
title2 = soup.select('div.zhangjieming > h1')[0].text
ls = []
for i in text:
    # Membership is tested against one long string, so every single character
    # that occurs anywhere in it is dropped from the chapter text.
    if i in "'\r','ads_wz_2();','\xa0','\t','\n','“','\t','■', '◆', 'n', '■', '◆', 'h', 'u', '■', '◆', 'b', '，', '∧', 'n', '♀', '※',":
        continue
    ls.append(i)
text = ''.join(ls)
print('正在下載{}'.format(title2))
with open('.\\books\\' + 'title1' + '.txt', 'ab+') as f:
    f.write((title2 + '\r\n').encode())  # write the chapter title
    f.write(text.encode())  # write the chapter body
    f.write('\r\n\r\n'.encode())  # blank line after the chapter

3.獲取一本小說的全部章節鏈接：

def get_urls(url, fenlei_title):
    """Download every chapter of one novel.

    Fetches the novel's chapter index, makes sure the novel's directory
    exists under ``books/<fenlei_title>/`` and passes each chapter link on
    the page to ``get_text``.

    Args:
        url: Address of the chapter index page,
            e.g. ``http://www.fhxiaoshuo.com/read/3/3414/``.
        fenlei_title: Category name, used as the parent directory name.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(2)
    # Decode the raw bytes directly. Re-encoding ``data.text`` as ISO-8859-1
    # only works when requests happened to decode the page as ISO-8859-1.
    soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')
    heading = soup.select('div#maininfo > div > h1')
    if not heading:
        # Unexpected page layout (e.g. an error page): skip instead of crashing.
        print('skipping {}: novel title not found'.format(url))
        return
    title1 = heading[0].text
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(os.path.join('.', 'books', fenlei_title, title1), exist_ok=True)
    print("正在下載{}".format(title1))
    for chapter_list in soup.select('div#list > dl'):
        time.sleep(2)
        for anchor in chapter_list.select('dd > a'):
            href = anchor.get('href')
            if not href:
                continue
            # urljoin leaves absolute links unchanged and resolves relative ones.
            get_text(urljoin(url, href), title1, fenlei_title)

4.獲取一個分類，比如武俠類的全部小說：

def get_list(url, fenlei_title):
    """Download every novel listed on one category page.

    Only the page at ``url`` is read; each novel link found on it is passed
    to ``get_urls``.

    Args:
        url: Address of the category page,
            e.g. ``http://www.fhxiaoshuo.com/sort/5/1/``.
        fenlei_title: Category name, forwarded to ``get_urls``.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(1)
    # Decode the raw bytes directly. Re-encoding ``data.text`` as ISO-8859-1
    # only works when requests happened to decode the page as ISO-8859-1.
    soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')
    for listing in soup.select('div#alist'):
        for anchor in listing.select('div.info > div.title > h2 > a'):
            href = anchor.get('href')
            if not href:
                continue
            time.sleep(3)
            # urljoin leaves absolute links unchanged and resolves relative ones.
            get_urls(urljoin(url, href), fenlei_title)

5.獲取首頁全部分類鏈接：

def get_fenlei():
    """Crawl every category linked from the site's home page.

    For each link in the home page navigation bar, a directory
    ``books/<category name>`` is created and the category page is passed to
    ``get_list``.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    url = 'http://www.fhxiaoshuo.com/'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(0.5)
    # Decode the raw bytes directly. Re-encoding ``data.text`` as ISO-8859-1
    # only works when requests happened to decode the page as ISO-8859-1.
    soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')
    for nav in soup.select('div.nav1 > ul'):
        for anchor in nav.select('li > a'):
            href = anchor.get('href')
            if not href:
                continue
            time.sleep(1)
            fenlei_title = anchor.text
            category_dir = os.path.join('.', 'books', fenlei_title)
            # NOTE(review): a category whose directory already exists is
            # skipped entirely, as in the original code - confirm this is
            # intended, since an interrupted category is never resumed.
            if os.path.exists(category_dir):
                continue
            # makedirs also creates the top-level "books" directory on first run.
            os.makedirs(category_dir, exist_ok=True)
            # urljoin leaves absolute links unchanged and resolves relative ones.
            get_list(urljoin(url, href), fenlei_title)

三、全部代碼如下（每次請求後使用 time.sleep() 暫停片刻，降低請求頻率，以保障網頁請求能正常完成）：

#!/usr/bin/env python
# -*- coding:utf-8 -*- 
#Author: ss


from bs4 import BeautifulSoup
import requests
import time
import os
import random

# Pool of browser User-Agent strings to choose from.
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    # The comma after the Firefox entry used to be missing, which silently
    # glued it to the next string and left one malformed User-Agent.
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
]

# Headers sent with every request. The User-Agent is chosen at random once,
# when this statement runs, to mimic a browser.
headers = {
    'User-Agent': random.choice(my_headers),
}

def get_text(url, title1, fenlei_title):
    """Download one chapter page and append it to the novel's txt file.

    The chapter title and the filtered chapter text are appended, UTF-8
    encoded, to ``books/<fenlei_title>/<title1>/<title1>.txt`` under the
    current directory.

    Args:
        url: Address of the chapter page,
            e.g. ``http://www.fhxiaoshuo.com/read/3/3414/6127874.shtml``.
        title1: Novel title, used as both the directory and the file name.
        fenlei_title: Category name, used as the parent directory name.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(2)
    # Decode the raw bytes directly. Re-encoding ``data.text`` as ISO-8859-1
    # only works when requests happened to decode the page as ISO-8859-1.
    soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')
    body_nodes = soup.select('div.zhangjieTXT')
    title_nodes = soup.select('div.zhangjieming > h1')
    if not body_nodes or not title_nodes:
        # Unexpected page layout (e.g. an error page): skip instead of crashing.
        print('skipping {}: chapter title or text not found'.format(url))
        return
    title2 = title_nodes[0].text

    # NOTE(review): this literal is used as one flat collection of characters,
    # not as a list of tokens. Every character occurring in it is removed,
    # including the letters and punctuation of 'ads_wz_2();', the quotes,
    # commas and spaces. Kept as is to preserve existing output - confirm intent.
    unwanted = "'\r','ads_wz_2();','\xa0','\t','\n','“','\t','■', '◆', 'n', '■', '◆', 'h', 'u', '■', '◆', 'b', '，', '∧', 'n', '♀', '※',"
    text = ''.join(ch for ch in body_nodes[0].text if ch not in unwanted)
    print('正在下載{}'.format(title2))

    # Build the path portably and create the directory here, so this function
    # does not rely on a caller having created it.
    book_dir = os.path.join('.', 'books', fenlei_title, title1)
    os.makedirs(book_dir, exist_ok=True)
    with open(os.path.join(book_dir, title1 + '.txt'), 'ab') as f:
        f.write((title2 + '\r\n').encode())  # chapter title
        f.write(text.encode())  # chapter body
        f.write('\r\n\r\n'.encode())  # blank line after the chapter


def get_urls(url, fenlei_title):
    """Download every chapter of one novel.

    Fetches the novel's chapter index, makes sure the novel's directory
    exists under ``books/<fenlei_title>/`` and passes each chapter link on
    the page to ``get_text``.

    Args:
        url: Address of the chapter index page,
            e.g. ``http://www.fhxiaoshuo.com/read/3/3414/``.
        fenlei_title: Category name, used as the parent directory name.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(2)
    # Decode the raw bytes directly. Re-encoding ``data.text`` as ISO-8859-1
    # only works when requests happened to decode the page as ISO-8859-1.
    soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')
    heading = soup.select('div#maininfo > div > h1')
    if not heading:
        # Unexpected page layout (e.g. an error page): skip instead of crashing.
        print('skipping {}: novel title not found'.format(url))
        return
    title1 = heading[0].text
    # makedirs also creates missing parents and tolerates an existing directory.
    os.makedirs(os.path.join('.', 'books', fenlei_title, title1), exist_ok=True)
    print("正在下載{}".format(title1))
    for chapter_list in soup.select('div#list > dl'):
        time.sleep(2)
        for anchor in chapter_list.select('dd > a'):
            href = anchor.get('href')
            if not href:
                continue
            # urljoin leaves absolute links unchanged and resolves relative ones.
            get_text(urljoin(url, href), title1, fenlei_title)

def get_list(url, fenlei_title):
    """Download every novel listed on one category page.

    Only the page at ``url`` is read; each novel link found on it is passed
    to ``get_urls``.

    Args:
        url: Address of the category page,
            e.g. ``http://www.fhxiaoshuo.com/sort/5/1/``.
        fenlei_title: Category name, forwarded to ``get_urls``.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(1)
    # Decode the raw bytes directly. Re-encoding ``data.text`` as ISO-8859-1
    # only works when requests happened to decode the page as ISO-8859-1.
    soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')
    for listing in soup.select('div#alist'):
        for anchor in listing.select('div.info > div.title > h2 > a'):
            href = anchor.get('href')
            if not href:
                continue
            time.sleep(3)
            # urljoin leaves absolute links unchanged and resolves relative ones.
            get_urls(urljoin(url, href), fenlei_title)

def get_fenlei():
    """Crawl every category linked from the site's home page.

    For each link in the home page navigation bar, a directory
    ``books/<category name>`` is created and the category page is passed to
    ``get_list``.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    url = 'http://www.fhxiaoshuo.com/'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    data = requests.get(url, headers=headers, timeout=30)
    time.sleep(0.5)
    # Decode the raw bytes directly. Re-encoding ``data.text`` as ISO-8859-1
    # only works when requests happened to decode the page as ISO-8859-1.
    soup = BeautifulSoup(data.content.decode('GB18030', errors='replace'), 'lxml')
    for nav in soup.select('div.nav1 > ul'):
        for anchor in nav.select('li > a'):
            href = anchor.get('href')
            if not href:
                continue
            time.sleep(1)
            fenlei_title = anchor.text
            category_dir = os.path.join('.', 'books', fenlei_title)
            # NOTE(review): a category whose directory already exists is
            # skipped entirely, as in the original code - confirm this is
            # intended, since an interrupted category is never resumed.
            if os.path.exists(category_dir):
                continue
            # makedirs also creates the top-level "books" directory on first run.
            os.makedirs(category_dir, exist_ok=True)
            # urljoin leaves absolute links unchanged and resolves relative ones.
            get_list(urljoin(url, href), fenlei_title)

# Script entry point: start the crawl from the site's home page.
get_fenlei()

asd

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找 爬蟲批量自動下載小說批量下載小說網站上的小說（python爬蟲） python爬蟲筆記（八）實例3：用Python批量爬取全站小說【以書趣閣為例】【爬蟲】對新筆趣閣小說進行爬取，保存和下載爬蟲實踐---排行榜小說批量下載 Node 爬蟲，批量下載並保存圖片純golang爬蟲實戰(三）－批量下載保存圖片從“頂點小說”下載完整小說——python爬蟲 python爬蟲-搜索小說並下載 python爬蟲分章節保存小說