一. Crawler Basics
1.1 The requests library
1.1.1 The seven request methods
requests.request()  the base method; the six convenience methods below all wrap it
requests.get(url, **kwargs)  GET request
requests.head()  fetch only the response headers
requests.post()  POST request
requests.put()  PUT request
requests.patch()  modify part of a resource
requests.delete()  DELETE request
url = "http://quanben5.com/n/doupocangqiong/6.html" headers = { "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"
} r = requests.get(url, headers=headers) data = { "pinyin": "doupocangqiong", "content_id": "4", } r = requests.post(url, data=data, headers=headers)
1.1.2 The keyword arguments
params  dict or bytes, appended to the URL as query parameters
data  dict, bytes, or file object, sent as the request body
json  JSON data sent as the request body
headers  dict of HTTP headers, e.g. to imitate a browser
cookies  dict or CookieJar
auth  tuple, for HTTP authentication
files  dict, for uploading files
timeout  timeout in seconds
proxies  dict of proxy servers, optionally with login credentials, e.g. pxs = {"http": "http://user:pass@10.10.1.1234"}
allow_redirects  default True, whether to follow redirects
stream  default False; when True the response body is not downloaded immediately
verify  default True, toggles SSL certificate verification
cert  path to a local SSL client certificate
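A short combined example, using the search endpoint of quanben5.com that appears later in this post (the parameter values here are only illustrative):

import requests

r = requests.get(
    "http://quanben5.com/index.php",
    params={"c": "book", "a": "search", "keywords": "doupocangqiong"},  # appended to the URL as ?c=book&a=search&...
    headers={"User-Agent": "Mozilla/5.0"},  # minimal fake browser header
    timeout=10,                             # give up after 10 seconds
    allow_redirects=True,
    verify=True,
)
print(r.url)          # shows the query string built from params
print(r.status_code)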
1.2 The BeautifulSoup class
soup = BeautifulSoup("","html.parser")
soup.prettify()
soup.find_all(name, attrs, recursive, string, **kwargs)
name: tag name
attrs: tag attributes
string: string content to match
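A minimal illustration on a hand-written snippet (the markup mimics the listmain structure used in section 二):

from bs4 import BeautifulSoup

html = '<div class="listmain"><dt>正文卷</dt><dd><a href="/1.html">第一章</a></dd></div>'
soup = BeautifulSoup(html, "html.parser")

soup.find_all('a')                    # match by tag name
soup.find_all('div', 'listmain')      # tag name plus class attribute
soup.find_all(string="第一章")         # match an exact string
soup.find_all('a', href="/1.html")    # any other attribute via **kwargs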
soup.head
soup.head.contents  list of direct children
soup.body.contents[1]
soup.body.children
soup.body.descendants
.parent
.parents
.next_sibling  the next sibling
.previous_sibling  the previous sibling
.next_siblings  all following siblings, as an iterator
.previous_siblings  all preceding siblings, as an iterator
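On a small hand-written document, these navigation attributes behave as follows:

from bs4 import BeautifulSoup

html = '<html><head><title>t</title></head><body><p>a</p><p>b</p></body></html>'
soup = BeautifulSoup(html, "html.parser")

soup.head.contents               # [<title>t</title>]  list of direct children
first_p = soup.body.contents[0]  # <p>a</p>
first_p.parent.name              # 'body'
first_p.next_sibling             # <p>b</p>
list(soup.body.children)         # iterator over direct children
list(soup.body.descendants)      # iterator over every nested node, strings included
list(first_p.next_siblings)      # iterator over all following siblings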
1.3 selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

d = webdriver.Chrome(chrome_options=chrome_options)  # run Chrome headless, without showing a browser window
d.get('http://quanben5.com/n/dazhuzai/23241.html')
time.sleep(2)
print(d.page_source)  # prints the source after the page has finished loading
Finding a single element versus multiple elements
d.find_element_by_id
d.find_elements_by_id
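The singular form returns the first matching WebElement and raises NoSuchElementException if nothing matches, while the plural form returns a (possibly empty) list, e.g. with the driver d from the snippet above:

button = d.find_element_by_id("su")        # one element, or an exception
links = d.find_elements_by_tag_name("a")   # a list of zero or more elements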
Interacting with elements
from selenium import webdriver

d = webdriver.Chrome()
d.get('http://www.baidu.com')
inputText = d.find_element_by_id("kw")
inputText.send_keys("蘿莉")
button = d.find_element_by_id("su")
button.click()
二. Hands-On
First, find a site where the novel can be read online.
A quick Baidu search turned up the Quanben 5200 novel site ("https://www.qb5200.tw"), which I tried first.
Open the chapter index of a novel:
https://www.qb5200.tw/xiaoshuo/0/357/
and view the page source.
The main-text volume heading is the second dt tag inside the div whose class is listmain,
and the chapter links that follow it are in a tags.

url_text = soup.find_all('div', 'listmain')[0]  # the first div with class="listmain"
main_text = url_text.find_all('dt')[1]          # the second dt tag under it
for tag in main_text.next_siblings:
    try:
        url = ''.join(['https://', host, tag.a.attrs['href']])
        print('parsering', url)
    except:
        continue
Then open any chapter
and view its source:

import requests, time
from bs4 import BeautifulSoup

def getHTMLText(url, headers={}):
    """
    Fetch the page source.
    :param url:
    :return: the Response object
    """
    if headers == {}:  # fall back to a default User-Agent when no headers are supplied
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"}

    # proxies = get_random_ip()
    proxies = {}
    try:
        # print("start", url)
        r = requests.get(url, proxies=proxies, headers=headers)
        r.raise_for_status()
        # r.encoding = r.apparent_encoding
        # print('end', url)
        return r
    except:
        return r.status_code

def parseByQB5200(soup, host, f):
    """
    Crawl the novel from the qb5200 page source.
    :param soup, host:
    :param f:
    :return:
    """
    url_text = soup.find_all('div', 'listmain')[0]
    main_text = url_text.find_all('dt')[1]
    x = 0
    for tag in main_text.next_siblings:
        time.sleep(1)
        try:
            url = ''.join(['https://', host, tag.a.attrs['href']])
            print('parsering', url)
            soup = BeautifulSoup(getHTMLText(url).text, "html.parser")
            passage = soup.find_all("div", "content")[0]
            title = passage.h1.string
            f.writelines(''.join([title, '\n']))
            passage = soup.find_all("div", "showtxt")[0]
            for i in passage.descendants:
                if i.name != "br":
                    st = i.string
                    if st.strip().startswith('http') or st.strip().startswith('請記住本書'):
                        continue
                    f.writelines(''.join(['  ', st.strip(), '\n']))
            x += 1
            print('%d.%s 下載完成' % (x, title))
        except:
            continue

def getNovelUrls(url):
    """
    Work out which site a chapter-index URL belongs to
    and dispatch to that site's crawler.
    :param url:
    :return:
    """

    response = getHTMLText(url)
    host = url.split('//')[1].split('/')[0]
    host_list = {
        "www.qb5200.tw": parseByQB5200,
        # "www.qu.la": parseByQuLa,
        "quanben5.com": parseByQB5
    }
    print(response)
    soup = BeautifulSoup(response.text, 'html.parser')
    with open('1.txt', 'w', encoding='utf8') as f:
        host_list[host](soup, host, f)

if __name__ == '__main__':
    getNovelUrls("https://www.qb5200.tw/xiaoshuo/0/357/")
The problem:
under this brute-force crawling, the qb5200 site ("www.qb5200.tw") only allows about three requests before it starts returning 403 errors.
I first assumed it was limiting the number of visits per IP, but the problem persisted even after switching to proxy IPs,
so it is presumably related to the request headers.
I therefore added every header the browser sends when visiting the site, and randomized the User-Agent:

import random

USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
]
headers = {
    "User-Agent": random.choice(USER_AGENT_LIST),
    "Host": "www.qb5200.tw",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Referer": "https://www.qb5200.tw/xiaoshuo/0/355/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": "UM_distinctid=16736c47b1d8b-04bd85d2b5e8ab-50422618-144000-16736c47b1e1f2; bcolor=; font=; size=; fontcolor=; width=; CNZZDATA1260750615=1497982366-1542811927-%7C1542882690; fikker-7ZUK-qXIL=qDaKEBihatSNEFgtMlGVIKubCHYLi89J; fikker-7ZUK-qXIL=qDaKEBihatSNEFgtMlGVIKubCHYLi89J; fikker-Sbwr-GN9F=GbMX3HOhwvoDMxopLMGt3VWliXQIK0SP; fikker-Sbwr-GN9F=GbMX3HOhwvoDMxopLMGt3VWliXQIK0SP; fikker-yJ3O-W61D=UfinETMnCR38ADWZEl1KNHQRU2m81Fwb; fikker-yJ3O-W61D=UfinETMnCR38ADWZEl1KNHQRU2m81Fwb; fikker-rWK3-6KHs=T9T5b7lYVJReTQviPm2IdLPyHu83RwFM; fikker-rWK3-6KHs=T9T5b7lYVJReTQviPm2IdLPyHu83RwFM"
}
This solved the problem.
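For reference, a minimal sketch of how those headers can be re-randomized before each request (make_headers is a hypothetical helper; getHTMLText is the function defined above):

def make_headers():
    h = dict(headers)                                  # copy the full browser header set above
    h["User-Agent"] = random.choice(USER_AGENT_LIST)   # fresh User-Agent on every call
    return h

r = getHTMLText("https://www.qb5200.tw/xiaoshuo/0/357/", headers=make_headers())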
三. An example of Ajax dynamic loading:
the Quanben5 novel site ("quanben5.com").
Searching the page source in the same way,
a problem shows up:
the HTML that is served contains only half of the source.
So press F12 to open the developer tools.
It turns out that all of the text arrives through an XHR request.
Click it to view the request headers:
it is a POST request,
the request URL is /index.php?c=book&a=ajax_content,
and the request data sits in the form payload at the bottom.
Open the page source and the JS files and search for those form fields.
They can be found in ajax.js and in the page source respectively.
The fields in the page source can be used directly to build the data form,
and ajax.js shows that the rndval field is the current time in milliseconds.
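Putting these observations together, here is a condensed sketch of replaying that Ajax request for a single chapter (it follows the fetch_async function in section 四 below; the chapter URL is just the example page used earlier, and the exact argument layout of ajax_post is inferred from the page source):

import re, time, requests

chapter_url = "http://quanben5.com/n/dazhuzai/23241.html"
ajax_url = "http://quanben5.com/index.php?c=book&a=ajax_content"

page = requests.get(chapter_url).text
# the chapter page embeds a call like ajax_post('...','pinyin','...','content_id','...', ...)
args = re.search(r'<script type="text/javascript">ajax_post\((.*)\)</script>', page).group(1).split("','")
args[9] = args[9][:-1]                          # strip the trailing quote from the last field

data = {args[i * 2]: args[i * 2 + 1] for i in range(1, 5)}   # rebuild the form fields
data['_type'] = "ajax"
data['rndval'] = int(time.time() * 1000)        # current time in milliseconds, per ajax.js

full_text = requests.post(ajax_url, data=data).text          # the complete chapter HTML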
四. Optimization
Switched to gevent for asynchronous I/O: each page is saved as its own file under temp, and the files are merged into a single txt at the end.
Added a search feature; for now only one novel site is supported.
The code:

#!/usr/bin/env python
# encoding: utf-8

"""
@version:
@author: Wish Chen
@contact: 986859110@qq.com
@file: get_novels.py
@time: 2018/11/21 19:43

"""
import gevent
from gevent import monkey
monkey.patch_all()
import requests, time, random, re, os
from bs4 import BeautifulSoup

dir = os.path.dirname(os.path.abspath(__file__))

def getHTMLText(url, data=[], method='get'):
    """
    Fetch the page source.
    Defaults to a GET request.
    :param url:
    :param data:
    :param method:
    :return:
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"}
    proxies = {}
    try:
        # print("start", url)
        r = requests.request(method, url, proxies=proxies, headers=headers, data=data)
        r.raise_for_status()
        # r.encoding = r.apparent_encoding
        # print('end', url)
        return r
    except:
        return r.status_code


def fetch_async(x, url):
    """
    The job run for each chapter by the asynchronous I/O pool:
    fetch the chapter page,
    replay the Ajax POST to obtain the full text,
    and write chapter x into the temp folder.
    :param x:
    :param url:
    :return:
    """
    url_main = "http://quanben5.com/index.php?c=book&a=ajax_content"
    r = getHTMLText(url)  # fetch the chapter page
    title = re.search(r'<h1 class="title1">(.*)</h1>', r.text).group(1)
    result = re.search(r'<script type="text/javascript">ajax_post\((.*)\)</script>', r.text).group(1)
    num_list = result.split("','")
    num_list[9] = num_list[9][:-1]
    content = {}  # build the form data for the simulated POST request
    for i in range(1, 5):
        content[num_list[i * 2]] = num_list[i * 2 + 1]
    content['_type'] = "ajax"
    content['rndval'] = int(time.time() * 1000)
    r = getHTMLText(url_main, data=content, method='post')  # replay the POST request
    soup = BeautifulSoup(r.text, "lxml")
    with open(os.path.join(dir, 'temp', "%s.txt" % x), "w", encoding='utf8') as f:
        f.writelines(''.join([str(x), '.', title, '\n\n']))
        for tag in soup.body.children:
            if tag.name == 'p':
                f.writelines(''.join(['    ', tag.string.strip(), '\n\n']))
    print('%d.%s 下載完成' % (x, title))


def get_together(name, author, x):
    """
    Merge the per-chapter txt files under the temp directory
    into a single file (and optionally delete the temp files).
    :param name:
    :param author:
    :return:
    """
    with open(os.path.join(dir, "%s.txt" % name), "w", encoding='utf8') as f:
        f.writelines(''.join([name, '\n\n作者:', author, '\n\n']))

        for i in range(x):
            try:
                f.write(open(os.path.join(dir, 'temp', "%s.txt" % (i+1)), "r", encoding='utf8').read())
                f.write('\n\n')
                # os.remove(os.path.join(dir, 'temp', "%s.txt" % (i+1)))
            except:
                continue


def parseByQB5(response, host):
    """
    Crawl the novel from the quanben5.com page source:
    get the book title and author,
    and fetch the chapters with gevent-based asynchronous I/O.
    :param response:
    :param host:
    :return:
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    url_text = soup.find_all('div', 'box')[2]
    main_text = url_text.find_all('h2')[0].next_sibling
    url_list = []
    for tag in main_text.descendants:
        if tag.name == 'li':
            url = ''.join(['http://', host, tag.a.attrs['href']])
            url_list.append(url)
    from gevent.pool import Pool
    pool = Pool(100)

    gevent.joinall([pool.spawn(fetch_async, i+1, url=url_list[i]) for i in range(len(url_list))])

    name = re.search(r"<h3><span>(.*)</span></h3>", response.text).group(1)
    author = re.search(r'<span class="author">(.*)</span></p>', response.text).group(1)
    print("%d文檔已下載,正在合並..." % len(url_list))
    get_together(name, author, len(url_list))


def getNovelUrls(url):
    """
    Work out which site a chapter-index URL belongs to
    and dispatch to that site's crawler.
    :param url:
    :return:
    """

    response = getHTMLText(url)
    host = url.split('//')[1].split('/')[0]
    host_list = {
        "quanben5.com": parseByQB5
    }
    host_list[host](response, host)


def get_url():
    input_name = input('>>')
    r = getHTMLText("http://quanben5.com//index.php?c=book&a=search&keywords=%s" % input_name)
    soup = BeautifulSoup(r.text, "html.parser")
    main_book = soup.find_all("div", "pic_txt_list")
    for i in range(len(main_book)):
        tag = main_book[i].h3
        print("%s.%s %s" % (i, tag.span.text, tag.next_sibling.next_sibling.text))
    choice = int(input(">>"))
    if choice in range(len(main_book)):
        return ''.join(["http://quanben5.com", main_book[choice].h3.a["href"], "xiaoshuo.html"])


if __name__ == '__main__':
    # url_list = [
    #     "https://www.qu.la/book/365/",
    #     "https://www.qb5200.tw/xiaoshuo/0/357/",
    #     "http://quanben5.com/n/doupocangqiong/xiaoshuo.html",
    #     "http://quanben5.com/n/dazhuzai/xiaoshuo.html",
    #     "http://quanben5.com/n/douluodalu/xiaoshuo.html",
    #     "http://quanben5.com/n/renxingderuodian/xiaoshuo.html"
    # ]
    # getNovelUrls(url_list[5])
    if not os.path.exists(os.path.join(dir, 'temp')):  # fetch_async writes into temp, so make sure it exists
        os.mkdir(os.path.join(dir, 'temp'))
    while True:
        url = get_url()
        time_start = time.time()
        getNovelUrls(url)
        print("成功爬取! 用時:%.2fs" % (time.time() - time_start))
The encapsulation is still not good enough.
Ideally each site would be wrapped in its own class,
or the project rewritten on the Scrapy framework.
That is still being designed...
五. Unsolved Problems
Proxy IPs:
only free proxy-IP sites are available for now;
the plan is to pick proxy IPs from them at random and route requests through them (sketched below).
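A hypothetical sketch of that plan (get_random_ip matches the name commented out in the earlier code; PROXY_POOL and its entry are placeholders, not real proxies):

import random, requests

PROXY_POOL = [
    "http://user:pass@10.10.1.1234",   # placeholder, same format as the proxies example in 1.1.2
]

def get_random_ip():
    proxy = random.choice(PROXY_POOL)   # pick one free proxy at random
    return {"http": proxy, "https": proxy}

r = requests.get("http://quanben5.com", proxies=get_random_ip(), timeout=10)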
Multi-site support:
the chapter-index and article-page structure of each site has to be examined,
so that each site gets its own parse function to extract its content.