1.前置知識
- html一些知識
- python基本語法
- 簡單的一些爬蟲庫api調用
2.所用到的包
- requests
- bs4(from bs4 import BeautifulSoup):Beautiful Soup 是一個可以從 HTML 或 XML 文件中提取數據的 Python 庫(可以理解為一個處理文本的工具)
- os
- sys
https://cn.python-requests.org/zh_CN/latest/
https://beautifulsoup.readthedocs.io/zh_CN/latest/
3.我練習所遇到的問題
- 部分頁面文本get下來 出現中文 亂碼
- requests.get(url, headers=..., timeout=...) 其中 headers 經常出現無效,以及請求 timeout
- 文本寫入文件時出現 gbk 編碼錯誤,無法寫入
正文
這里我用來練習的網站和內容是 python 下載 https://www.biqukan.com/1_1094/ 一念永恆這章節特別多的小說吧?(這是小說?)
目標是 下載全部的作者發表的全部章節(我選取這本小說是因為 這是我翻了半天最多的)
步驟1
獲得一個頁面返回
# Pool of browser User-Agent strings, meant to be sampled with random.choice()
# so that successive requests do not all present the same client identity.
user_agent = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
'Opera/8.0 (Windows NT 5.1; U; en)',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)'
]
# Running number of the next failed request attempt; get_html() prints it and
# increments it each time a request raises.
ers = 1
def get_html(url, max_retries=None):
    """Fetch *url* with a randomly chosen User-Agent, retrying on request errors.

    The User-Agent is drawn from the module-level ``user_agent`` list on every
    attempt. The previous version rebound ``user_agent`` to a single string
    inside the function, so ``random.choice`` returned one *character* of that
    string and the server received a one-letter User-Agent header.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of failed attempts before the last error
            is re-raised. ``None`` (the default) retries forever, matching the
            original behaviour.

    Returns:
        The ``requests.Response`` of the first attempt that did not raise.

    Raises:
        requests.RequestException: Only when ``max_retries`` is given and that
            many attempts have failed.
    """
    global ers
    failures = 0
    while True:
        # Build the header inside the loop so each retry can present a
        # different browser identity.
        headers = {'User-Agent': random.choice(user_agent)}
        try:
            res = requests.get(url, headers=headers, timeout=10)
            break
        except requests.RequestException as e:
            print("第 ", ers, " loop ", e)
            ers = ers + 1
            failures += 1
            if max_retries is not None and failures >= max_retries:
                raise
    print(res.encoding)  # character set requests inferred for this response
    print(type(res))
    return res
這里我用死循環 來處理偶爾出現的 無效的瀏覽頭 timeout
步驟2
分析 既然是獲取全部章節 我首先看到了 這個網站對每一個小說章節直接可以在對應書id目錄 全部看到 連翻頁都不需要… 這樣就簡單了 F12 查看每個章節href 位置 看看如何 find_all()
這里我們發現 只要獲得 class_='listmain' 的 div 就完事了 之后在這個 div 上調用 find_all('a') 就可以獲得全部的 文章名字和鏈接
i.get('href') 獲得標籤的屬性值(這里是鏈接地址) i.string 獲得標籤所包含的文本
def get_list(url):
    """Return ``[title, absolute_url]`` pairs for every chapter linked from *url*.

    The chapter links are the ``<a>`` elements inside the first
    ``<div class="listmain">`` of the index page.

    Args:
        url: Address of the novel's chapter index page.

    Returns:
        A list of two-element lists ``[chapter_title, chapter_url]`` in page
        order. Anchors without an ``href`` are skipped.

    Raises:
        IndexError: If the page contains no ``div.listmain`` element.
    """
    # Local import: only this function needs URL joining.
    from urllib.parse import urljoin

    html = get_html(url)
    # Parse the raw bytes so BeautifulSoup can detect the charset the page
    # declares. html.text is decoded with whatever encoding requests guessed
    # from the headers, which garbles non-Latin titles when that guess is
    # ISO-8859-1.
    soup = BeautifulSoup(html.content, 'html.parser')
    listing = soup.find('div', class_='listmain')
    if listing is None:
        # Same exception type the old div_list[0] lookup produced, with a
        # message that says what is actually wrong.
        raise IndexError("no <div class='listmain'> found at " + url)

    base_url = 'https://www.biqukan.com'
    url_second = []
    # Search the found element directly; re-parsing str(element) is redundant.
    for link in listing.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        # get_text() always yields a string, whereas .string is None for
        # anchors that contain nested tags.
        url_second.append([link.get_text(), urljoin(base_url, href)])
    return url_second
步驟3
我們get_list返回的 所有文章地址和title 此時我們考慮如何獲得文章
文章內容在 div id=‘content’, class_=‘showtxt’
頁面放入soup中 直接 find_all('div', id='content', class_='showtxt')
這里你可能看到這樣的事情
你可能發現 我們用 encode 進行編碼 而且用的是 'ISO-8859-1'(requests 在響應頭沒有指定 charset 時的默認編碼 可能是反爬也可能是網站就是這么寫的) 這樣先把文本還原成原始字節 再交給 BeautifulSoup 按頁面聲明的編碼解析 中文就正常了
def get_text(list_url):
    """Download each chapter in *list_url* and save it as ``data/<title>.txt``.

    Chapters whose output file already exists are skipped, so an interrupted
    run can be resumed.

    Args:
        list_url: Iterable of ``[title, url]`` entries, as returned by
            ``get_list``.
    """
    # Create the output directory up front; open() does not create it.
    os.makedirs('data', exist_ok=True)
    for entry in list_url:
        title, chapter_url = entry[0], entry[1]
        print(title, chapter_url)
        if not title:
            print("chapter without a title, skipped:", chapter_url)
            continue
        # A '/' in the title would be read as a directory separator.
        path = 'data/' + title.replace('/', '_') + '.txt'
        if os.path.exists(path):
            continue
        html = get_html(chapter_url)
        # html.content is the raw byte payload. It equals
        # html.text.encode('ISO-8859-1') when requests decoded as Latin-1, and
        # does not raise UnicodeEncodeError when it decoded as something else.
        soup = BeautifulSoup(html.content, 'html.parser')
        div_text = soup.find_all('div', id='content', class_='showtxt')
        if not div_text:
            # Do not abort the whole download for one unexpected page.
            print("no content block found, skipped:", chapter_url)
            continue
        # The context manager closes the file even if the write fails.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(div_text[0].text)
如果沒有 encode 這一步的話 你會看到
上面是直接request的 后面是 .encode(‘ISO-8859-1’)過的
我看頁面已經寫了 是 gbk 編碼 但是 print(res.encoding) 是 'ISO-8859-1' 這里 encode 這個編碼算是我亂試出的
以下是全部代碼
#!/usr/bin/env python3
import requests
import time
import random
from bs4 import BeautifulSoup
import os
import sys
# Pool of browser User-Agent strings, meant to be sampled with random.choice()
# so that successive requests do not all present the same client identity.
user_agent = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
'Opera/8.0 (Windows NT 5.1; U; en)',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)'
]
# Running number of the next failed request attempt; get_html() prints it and
# increments it each time a request raises.
ers = 1
def get_html(url, max_retries=None):
    """Fetch *url* with a randomly chosen User-Agent, retrying on request errors.

    The User-Agent is drawn from the module-level ``user_agent`` list on every
    attempt. The previous version rebound ``user_agent`` to a single string
    inside the function, so ``random.choice`` returned one *character* of that
    string and the server received a one-letter User-Agent header.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of failed attempts before the last error
            is re-raised. ``None`` (the default) retries forever, matching the
            original behaviour.

    Returns:
        The ``requests.Response`` of the first attempt that did not raise.

    Raises:
        requests.RequestException: Only when ``max_retries`` is given and that
            many attempts have failed.
    """
    global ers
    failures = 0
    while True:
        # Build the header inside the loop so each retry can present a
        # different browser identity.
        headers = {'User-Agent': random.choice(user_agent)}
        try:
            res = requests.get(url, headers=headers, timeout=10)
            break
        except requests.RequestException as e:
            print("第 ", ers, " loop ", e)
            ers = ers + 1
            failures += 1
            if max_retries is not None and failures >= max_retries:
                raise
    print(res.encoding)  # character set requests inferred for this response
    print(type(res))
    return res
def get_list(url):
    """Return ``[title, absolute_url]`` pairs for every chapter linked from *url*.

    The chapter links are the ``<a>`` elements inside the first
    ``<div class="listmain">`` of the index page.

    Args:
        url: Address of the novel's chapter index page.

    Returns:
        A list of two-element lists ``[chapter_title, chapter_url]`` in page
        order. Anchors without an ``href`` are skipped.

    Raises:
        IndexError: If the page contains no ``div.listmain`` element.
    """
    # Local import: only this function needs URL joining.
    from urllib.parse import urljoin

    html = get_html(url)
    # Parse the raw bytes so BeautifulSoup can detect the charset the page
    # declares. html.text is decoded with whatever encoding requests guessed
    # from the headers, which garbles non-Latin titles when that guess is
    # ISO-8859-1.
    soup = BeautifulSoup(html.content, 'html.parser')
    listing = soup.find('div', class_='listmain')
    if listing is None:
        # Same exception type the old div_list[0] lookup produced, with a
        # message that says what is actually wrong.
        raise IndexError("no <div class='listmain'> found at " + url)

    base_url = 'https://www.biqukan.com'
    url_second = []
    # Search the found element directly; re-parsing str(element) is redundant.
    for link in listing.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        # get_text() always yields a string, whereas .string is None for
        # anchors that contain nested tags.
        url_second.append([link.get_text(), urljoin(base_url, href)])
    return url_second
def get_text(list_url):
    """Download each chapter in *list_url* and save it as ``data/<title>.txt``.

    A progress percentage is written to stdout for every entry. Chapters whose
    output file already exists are skipped, so an interrupted run can be
    resumed.

    Args:
        list_url: Sequence of ``[title, url]`` entries, as returned by
            ``get_list``.
    """
    # Create the output directory up front; open() does not create it.
    os.makedirs('data', exist_ok=True)
    total = len(list_url)
    # Count from 1 so the last entry reports exactly 100%. The old counter
    # started at 1 and was incremented before use, so it ran one ahead.
    for done, entry in enumerate(list_url, start=1):
        title, chapter_url = entry[0], entry[1]
        print(title, chapter_url)
        # Scale to a real percentage; the old value was the raw 0..1 fraction
        # printed with a '%' sign.
        sys.stdout.write(" 進度:%.3f%%" % (100.0 * done / total) + '\r')
        sys.stdout.flush()
        if not title:
            print("chapter without a title, skipped:", chapter_url)
            continue
        # A '/' in the title would be read as a directory separator.
        path = 'data/' + title.replace('/', '_') + '.txt'
        if os.path.exists(path):
            continue
        html = get_html(chapter_url)
        # html.content is the raw byte payload. It equals
        # html.text.encode('ISO-8859-1') when requests decoded as Latin-1, and
        # does not raise UnicodeEncodeError when it decoded as something else.
        soup = BeautifulSoup(html.content, 'html.parser')
        div_text = soup.find_all('div', id='content', class_='showtxt')
        if not div_text:
            # Do not abort the whole download for one unexpected page.
            print("no content block found, skipped:", chapter_url)
            continue
        # The context manager closes the file even if the write fails.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(div_text[0].text)
def get_t(url):
    """Debug helper: fetch one chapter page and print it before and after re-encoding.

    Waits one second, downloads *url*, then prints in order: the text as
    decoded by requests, the same text re-encoded to ISO-8859-1 bytes, and
    the text of the first ``div#content.showtxt`` parsed from those bytes.
    """
    time.sleep(1)
    response = get_html(url)
    decoded = response.text
    # Re-encoding with ISO-8859-1 turns the decoded text back into bytes for
    # BeautifulSoup to parse.
    raw_bytes = decoded.encode('ISO-8859-1')
    page = BeautifulSoup(raw_bytes)
    print(decoded)
    print(raw_bytes)
    content_blocks = page.find_all('div', id='content', class_='showtxt')
    print(content_blocks[0].text)
if __name__ == "__main__":
    # Script entry point: collect the chapter index of the chosen novel, then
    # download every chapter it lists.
    index_url = 'https://www.biqukan.com/1_1094'
    chapters = get_list(index_url)
    get_text(chapters)
    # Manual debugging entry points, left disabled:
    # get_html('https://www.biqukan.com/1_1094')
    # get_t('https://www.biqukan.com/1_1094/5447905.html')