笔趣阁 has many mirror sites; the one scraped this time is https://www.vipxs.la/
I have been reading 一世之尊 recently, so I wanted to try scraping the complete novel from 笔趣阁.
First, the index page of 一世之尊 is https://www.vipxs.la/0_740/
From this page we can find the URL of every chapter. These need to be saved into a list so that all chapters can be scraped by iterating over it.
Inspecting the HTML with the F12 developer tools shows that the chapter URLs all sit inside <div id="list">, in <a href="xxx"> tags; after discarding the redundant links at the head and tail, we are left with the target chapter URLs, as in the sketch below.
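To make that concrete, here is a minimal sketch of the extraction (assumptions: the chapter links sit in <div id="list"> as described above and their hrefs are relative to the site root; exactly how many junk links to trim can vary per book):

import requests
from bs4 import BeautifulSoup

index_url = "https://www.vipxs.la/0_740/"
res = requests.get(index_url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(res.content, "lxml")

chapter_urls = []
list_div = soup.find("div", id="list")        # container holding the chapter index
for a in list_div.find_all("a"):
    href = a.get("href")
    if href:                                  # skip anchors without an href
        chapter_urls.append("https://www.vipxs.la" + href)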
Looking at a chapter page, the scraping itself looked simple: the text all lives in <div id="content">. Still, I ran into a problem.
Problem 1: while scraping the text, printing the result showed only the last line; the rest of the text had vanished. My guesses: 1. an anti-scraping mechanism returning a fake page; 2. the text is generated by JavaScript rather than present in the static page; 3. an encoding/decoding issue.
Guess 1: building a User-Agent, cookies and Referer, and even switching IPs, had no effect, so I set this aside for the time being.
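For reference, that attempt looked roughly like this (a sketch only; the header and cookie values are placeholders to be copied from your own browser session, not the ones I actually used):

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",  # placeholder UA
    "Referer": "https://www.vipxs.la/",
    "Cookie": "copied_from_your_own_browser_session",               # placeholder cookie string
}
res = requests.get("https://www.vipxs.la/0_740/", headers=headers, timeout=15)
print(res.status_code)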
Guess 2: watching the XHR requests in the Network panel of the F12 tools, I did not find the target data. Then, after disabling JavaScript for the page, the text still showed, and viewing the page source also revealed the text directly. Everything indicated guess 2 was wrong.
Guess 3: fetching the content via .find("div", id="content").text made the text display correctly.
What remains is cleanup: the scraped content contains some ads and extra line breaks. split() quickly strips the leading/trailing spaces and newline characters; a small sketch of this follows.
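A minimal sketch of the extraction and cleanup for a single chapter (the chapter URL below is a placeholder, and the [:-2] slice assumes the trailing ad text ends up in the last two whitespace-separated chunks, as it did in my case):

import requests
from bs4 import BeautifulSoup

chapter_url = "https://www.vipxs.la/0_740/placeholder.html"   # placeholder chapter URL
res2 = requests.get(chapter_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)

soup = BeautifulSoup(res2.content, "lxml")
title = soup.find("h1").text                      # chapter title
content = soup.find("div", id="content").text     # raw chapter text

parts = content.split()                           # split() drops spaces, \r\n and blank runs
parts = parts[:-2]                                # drop the trailing ad chunks
print(title)
print("\n".join(parts))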
Multithreading is used to speed up the crawl, but crawling too fast causes reconnect timeouts... The full script follows.
import time
import requests
from bs4 import BeautifulSoup
import random
import threading
import os

url = "https://www.vipxs.la/0_740/"
# Pool of User-Agent strings to rotate between requests
headers = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
]
referer = "https://www.vipxs.la/"
cookies = {'Cookie': "UM_distinctid=17f533be31115db-00017a4b760e46-977173c-144000-17f533be312c6c; Hm_lvt_8744b58bc1913cae0d8c4dc68f187d61=1646368908,1646368915,1646450962; CNZZDATA1280571925=488337212-1646363111-https%3A%2F%2Fwww.baidu.com%2F|1646449511; coupletAlllength=5; CNZZDATA1280571999=226664088-1646363299-https%3A%2F%2Fwww.baidu.com%2F|1646449699; Hm_lvt_b48494e860b198c9c71009978cfc755e=1646368908,1646368915,1646450962; fixedalllength=9; Hm_lvt_2d2ceac9af7f7f1a8dbdd51db6dbf36c=1646368908,1646368915,1646450962; 5531_2603_27.38.254.113=1; CNZZDATA1280572003=1348579129-1646364283-https%3A%2F%2Fwww.baidu.com%2F|1646450689; fixedall1length=8; CNZZDATA1280572006=1040752295-1646363481-https%3A%2F%2Fwww.baidu.com%2F|1646449881; Hm_lvt_dd3a5d36b1adfd567e4b8290c0760ba3=1646368908,1646368915,1646450963; clickbids=740; Hm_lvt_4d0a92fe9eb4da3973f356b734b334b6=1646368908,1646368915,1646450963; img3002500length=6; 5531_2570_27.38.254.113=1; Hm_lvt_4ad6b1a6d9755b262a181c469db16477=1646368913,1646450973; 5531_2444_27.38.254.113=1; 5531_2409_27.38.254.113=1; 5531_2403_27.38.254.113=1; CNZZDATA1280572013=42055633-1646365418-https%3A%2F%2Fwww.vipxs.la%2F|1646451818; 5531_2334_27.38.254.113=1; Hm_lpvt_4d0a92fe9eb4da3973f356b734b334b6=1646452229; 5531_2578_27.38.254.113=1; 5531_2563_27.38.254.113=1; coupletAll=1_0_4_3; fixedall=8_3_0_1_7; img3002500=1; fixedall1=6_2_5_7; richviews_5531=cRE9U1a3frz1iDNhc0K7SiahoOmhFed824EmDGllfAcca2YveADIUUZ4RaxXDzxli%2FHutkjPerP9wyrRHpug%2Fk%2B%2FXdViyzcaXEypaCEzuSyrbR9rvqKz9%2B81xBsynM6omYQw9eI3x0PEJ%2FmAv2AsKOY21ere%2Bf4rafFzUUOPSOxxXLwHf95U1sXNnYeOhr9bO8C3j36sy1MkcP77Qh9gspMwrZ4H0%2BfU6rnQPrHZ6CK1hXCb3tiIf6xo6FBRjO%2FgqIO%2FHDGk%2B1CM818cVCaBZ9Fs2LSVVUS7O%2Fa2SrNL7cJPFab2Bk%2FdLithl3nVy4MBs%2B4zlOoKCBlJgo7%2FgZ81Jo%2Bm9L%2BXWpWErQB%2FSEXRAoUVYIQ6TruK8dqMZPqQCUVJHqUtXDu0NCqW2r0KinusY8Rc5tlzdayjPWF%2F7yNEwsGb0LVYWk4Q9Atf4lHmt14iY9b4O0MLPZwckbtZ4IIY7SbW5yOn%2FHtyaJS0EvjOpW%2B7KS%2FVZ4LfxkwzbquJANRA7nHhVOMkUt9ldFOqcIaZB67%2BPHDwub0o4cfyKyi%2BaU2jOkmnnKxpRwFAjQEVF0Dd5m6T0xUCN9SL04vmT%2FQEHg47z0NyL9txUFInfFU7qhlGzFUKpoTbqzAogzKRVn1N%2BItSh1Atqcme8eLqzr%2BTw1grq7Dkbn9f52e47o%2FEl38%3D; Hm_lpvt_8744b58bc1913cae0d8c4dc68f187d61=1646452466; Hm_lpvt_b48494e860b198c9c71009978cfc755e=1646452466; Hm_lpvt_2d2ceac9af7f7f1a8dbdd51db6dbf36c=1646452466; Hm_lpvt_dd3a5d36b1adfd567e4b8290c0760ba3=1646452466; Hm_lpvt_4ad6b1a6d9755b262a181c469db16477=1646452466"}

# Collect the chapter URLs from the index page
chapter_list = []
res = requests.get(url, headers={"User-Agent": random.choice(headers), "Referer": referer})
bsobj = BeautifulSoup(res.content, 'lxml')  # build a BeautifulSoup object from the page source for easier handling
temp = bsobj.find_all("a")
for i in temp[35:-10]:  # drop the redundant links at the head and tail
    chapter_list.append(i.get('href'))


def pachong(chapter_list, name):
    """Scrape every chapter in chapter_list and append it to <name>.txt."""
    for i in chapter_list:
        url = "https://www.vipxs.la" + i
        max_retry = 15
        for n in range(max_retry):
            try:
                time.sleep(random.randint(1, 10))  # random delay to avoid hammering the site
                res2 = requests.get(url, headers={'User-Agent': random.choice(headers)},
                                    cookies=cookies, timeout=15)
                if res2.status_code == 200:
                    break
            except:
                print(f"connect error, retry times : {n+1}")
                if n + 1 == max_retry:
                    print("Too many reconnect attempts, stopping.")
                    exit()
        bsobj2 = BeautifulSoup(res2.content, 'lxml')  # build a BeautifulSoup object from the page source
        content = bsobj2.find("div", id="content").text
        title = bsobj2.find("h1").text
        temp = content.split()  # strips spaces and newlines
        with open(f"{name}.txt", "a", encoding="utf-8") as f:
            f.write("-------" + title + "-------" + "\r\n")
            for j in temp[:-2]:  # the last two chunks are ads
                f.write(" " + j + "\r\n")
        print(f"Finished scraping <<{title}>>!")
    print("Done!!")


# Split the chapter list across several threads
t_list = []
threading_num = 11
for i in range(threading_num):
    book_num = threading_num - 1
    temp = int(len(chapter_list) / book_num) * (i + 1)
    if temp <= len(chapter_list):
        t = threading.Thread(target=pachong,
                             args=(chapter_list[temp - int(len(chapter_list) / book_num):temp], f"一世之尊{i+1}"))
    else:
        # the last slice takes whatever chapters remain
        t = threading.Thread(target=pachong,
                             args=(chapter_list[temp - int(len(chapter_list) / book_num):], f"一世之尊{i+1}"))
    t_list.append(t)
    t.start()
for t in t_list:
    t.join()


def check(threading_num, true_length):
    """Count the chapter headers written to the part files and compare with the expected total."""
    count = 0
    for i in range(threading_num):
        with open(f"一世之尊{i + 1}.txt", "r", encoding="utf-8") as f:
            for j in f:
                if "-----" in j:
                    count += 1
    return true_length == count


def combine(threading_num):
    """Merge the per-thread part files into a single 一世之尊.txt and delete them."""
    with open("一世之尊.txt", "w", encoding="utf-8") as f:
        for i in range(threading_num):
            with open(f"一世之尊{i+1}.txt", "r", encoding="utf-8") as f2:
                f.write(f2.read())
            os.remove(f"一世之尊{i+1}.txt")


if check(threading_num, len(chapter_list)):
    print("Verification passed, download has no errors!")
    combine(threading_num)