# Import the required packages.
# This script scrapes "Perfect World" (完美世界) from the Dingdian novel site
# (顶点小说) as an example. aa.text and bb.text are plain text files created
# by the author.
import requests
from bs4 import BeautifulSoup

# Target URL to scrape.
url = 'https://www.x23us.com/html/42/42377/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3393.4 Safari/537.36'
}


# Standalone helper that opens bb.text, the file storing the URLs of
# chapters that have already been saved.
def open_href():
    # Read every stored URL and return them as a list; on the very first
    # run bb.text may not exist yet, so fall back to an empty list.
    try:
        with open('bb.text', 'r', encoding='utf-8') as f:
            return f.readlines()
    except FileNotFoundError:
        return []


# Request the target URL and return the page source as text.
def page_index():
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        return html.text


# Parse the index page to collect the URL of every chapter.
def page_list(html):
    a = []
    if html:
        html_bs4 = BeautifulSoup(html, 'lxml')
        html_b = html_bs4.select('.L a')
        for i in html_b:
            title = i.get_text()
            # Chapter links are relative, so join them onto the base URL.
            href = url + i.get('href')
            data = {
                'title': title,
                'href': href
            }
            a.append(data)
    return a


# Open aa.text, append the chapter title and body, and close the file.
def text_cun_html(title, html):
    if html:
        with open('aa.text', 'a+', encoding='utf-8') as f:
            f.write(title + '\n' + html + '\n')
        print('Chapter saved successfully!')
        return 'yes'


# Open bb.text and append the URL of the chapter just written to aa.text,
# so a restarted run can skip it.
def text_cun_href(href):
    if href:
        with open('bb.text', 'a+', encoding='utf-8') as f:
            f.write(href + '\n')
        print('URL saved successfully!')
        return 'ok'


# Make a second request for each chapter URL and extract the chapter body.
def html_list_index(title, href):
    if href:
        html = requests.get(url=href, headers=headers)
        if html.status_code == 200:
            bs = BeautifulSoup(html.text, 'lxml')
            bs4 = bs.select('#contents')
            for item in bs4:
                a = text_cun_html(title, item.get_text())
                # text_cun_html returns 'yes' once the chapter body is
                # archived; only then record its URL for resuming.
                if a == 'yes':
                    text_cun_href(href)


def main():
    # First load the list of already-scraped URLs from bb.text.
    number = open_href()
    print(number)
    html = page_index()
    data = page_list(html)
    for i in data:
        title = i.get('title')
        href = i.get('href')
        print(href)
        # After a crash, skip URLs that were already scraped when the
        # script is restarted (readlines() keeps the trailing newline,
        # hence the '\n' in the membership test).
        if href + '\n' not in number:
            html_list_index(title, href)


if __name__ == '__main__':
    main()

# A simple take on resumable ("breakpoint resume") scraping.
# Comments and discussion are welcome.
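
# ---------------------------------------------------------------------------
# Optional sketch (not part of the original script): the resume check in
# main() does a linear scan of the `number` list for every chapter. For a
# long book, loading bb.text into a set gives O(1) membership tests. The
# helper name `load_scraped` below is an illustrative assumption.
def load_scraped(path='bb.text'):
    """Return the set of already-scraped URLs recorded in `path`, stripped of newlines."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return {line.strip() for line in f}
    except FileNotFoundError:
        return set()

# Usage sketch inside main(); note the stripped set means no '\n' suffix
# is needed in the membership test:
#     scraped = load_scraped()
#     if href not in scraped:
#         html_list_index(title, href)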
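
# ---------------------------------------------------------------------------
# Optional sketch (also an assumption, not from the original): page_index()
# and html_list_index() issue bare requests.get() calls with no timeout, so
# a stalled connection hangs the scraper and a transient error aborts the
# resume loop. A small retry wrapper like the hypothetical `fetch` below is
# one way to harden both request sites.
import time

def fetch(target_url, retries=3, delay=1.0):
    """GET target_url with a timeout, retrying a few times; return None on failure."""
    for _ in range(retries):
        try:
            resp = requests.get(target_url, headers=headers, timeout=10)
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            pass  # swallow transient network errors and retry
        time.sleep(delay)  # brief pause between attempts, also polite to the site
    return None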