'''
獲取全書網的任意一本書的正文,每個章節為一個txt文件(如果章節太多可以獲取前5章),這些文件全部放在以書名命名的文件夾中
'''
import os
import re
from urllib.parse import quote, urljoin

import requests
class QuanShu:
    """Interactive downloader for novels hosted on www.quanshuxs.com.

    Typical use is ``QuanShu(name).get_search_html()``: the search results
    are printed, the user picks one by index, and every chapter of the
    chosen work is saved as ``<title>/<chapter name>.txt`` under the current
    working directory.
    """

    # Seconds to wait for the site before a request is abandoned; without a
    # timeout ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, name=''):
        """Build the search URL for the novel title ``name``.

        Raises:
            UnicodeEncodeError: if ``name`` contains characters that cannot
                be represented in GBK.
        """
        # The keyword is sent percent-escaped in a GB encoding. GBK is a
        # superset of GB2312 (identical bytes for every GB2312 character),
        # so it additionally accepts Traditional Chinese input instead of
        # raising UnicodeEncodeError.
        keyword = quote(name.encode('gbk'))
        self.host = 'http://www.quanshuxs.com/'
        self.url = f'http://www.quanshuxs.com/search.asp?key={keyword}&x=0&y=0'

    def _fetch(self, url):
        """Download ``url`` and return its body decoded with the detected charset."""
        resp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        resp.encoding = resp.apparent_encoding
        return resp.text

    @staticmethod
    def _safe_filename(text):
        """Return ``text`` with characters that are unsafe in file names replaced by ``_``."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', text).strip()
        return cleaned or '_'

    def get_search_html(self):
        """Run the search, list the hits, and download the one the user picks.

        Prints one line per search hit, prompts for the index of the work to
        download and hands the work's URL to ``get_works_html``. Returns
        ``None``; when nothing was found or the entered index is invalid a
        message is printed and nothing is downloaded.
        """
        html = self._fetch(self.url)
        tables = re.findall(
            r'<table cellspacing="0" cellpadding="0" width="962" border="0" align="center" class="m9">(.*?)</table>',
            html, re.S)
        url_list = []
        for table in tables:
            # ``[^"]*`` keeps the capture inside a single href attribute; a
            # greedy ``.*`` with re.S could swallow several anchors at once.
            works_url = re.findall(r'<a href="([^"]*)" target="_blank">', table)
            if not works_url:
                # A result table without a work link ends the listing.
                break
            works_info = re.findall(r'<a href=".*?">(.*?)</a>', table, re.S)
            works_status = re.findall(r'狀態: </font>\r\n(.*?) \|', table, re.S)
            # Pad so that a table with fewer anchors than expected yields
            # blank fields rather than an IndexError.
            raw_name, new_chapter, works_author, works_type = (works_info + [''] * 4)[:4]
            works_name = str(raw_name).replace("<font color='red'>", "").replace("</font>", "")
            status = works_status[0] if works_status else ''
            index = len(url_list)
            # urljoin copes with relative, root-relative and absolute hrefs.
            url_list.append(urljoin(self.host, works_url[0]))
            print(
                f'序號:{index:3}作品名稱:{works_name} 最新章節:{new_chapter} 作者: {works_author} 類型: {works_type} 狀態: {status}',
                end='\n\n')
        if not url_list:
            print("沒有搜索到對應作品!")
            return
        choice = input("請選擇需要下載的作品序號(enter):").strip()
        try:
            selected = int(choice)
        except ValueError:
            selected = -1
        if not 0 <= selected < len(url_list):
            print("輸入的序號無效!")
            return
        self.get_works_html(url_list[selected])

    def get_works_html(self, url):
        """Open a work's index page, create its folder and download its chapters.

        Sets ``self.title`` to the work title (made safe for use as a
        directory name), creates that directory if needed, and starts the
        chapter download at the first chapter link found on the page.

        Args:
            url: Absolute URL of the work's index page.
        """
        html = self._fetch(url)
        tables = re.findall(r'class="mread">(.*?)</table>', html, re.S)
        titles = re.findall(r'<font color="#7B352B">(.*?)全文閱讀</font>', tables[0], re.S) if tables else []
        if not titles:
            print(f'無法解析作品頁面:{url}')
            return
        self.title = self._safe_filename(titles[0])
        os.makedirs(self.title, exist_ok=True)
        chapters = re.findall(r'<div class="bai"><a href="(.*?)">(.*?)</a>', tables[0], re.S)
        if not chapters:
            print(f'無法解析作品頁面:{url}')
            return
        # Only the first chapter is needed: each chapter page links to the next.
        first_href = chapters[0][0]
        self.get_chapter_html(urljoin(url, first_href))

    def get_chapter_html(self, url):
        """Download the chapter at ``url`` and every chapter that follows it.

        Each chapter is written to ``<self.title>/<chapter name>.txt`` as
        UTF-8. The "next chapter" link of each page is followed until a page
        has none, cannot be parsed, or points to a page already visited.

        Args:
            url: Absolute URL of the first chapter to download.
        """
        visited = set()
        # Iterate instead of recursing: one stack frame per chapter would hit
        # Python's recursion limit on long books.
        while url and url not in visited:
            visited.add(url)
            html = self._fetch(url)
            names = re.findall(r"<strong>(.*?)</strong>", html, re.S)
            bodies = re.findall(r'<td colspan="2" class="content">(.*?)</td>', html, re.S)
            if not names or not bodies:
                print(f'無法解析章節頁面:{url}')
                return
            chapter_name = names[0]
            print(f'正在下載---->{chapter_name}')
            next_links = re.findall(
                r"<a href='([^']*)'><font color='#7B352B'>下一章</font></a> \( → \)", html)
            # Line breaks become newlines; inline images are reduced to their
            # bare file name. The original replaced '—' with itself (a no-op),
            # presumably a lost HTML entity, so the entity is decoded here.
            content = (bodies[0]
                       .replace('<br><br>', '\n')
                       .replace('<img src="image/', '')
                       .replace('.jpg">', ' ')
                       .replace('&mdash;', '—'))
            path = os.path.join(self.title, f'{self._safe_filename(chapter_name)}.txt')
            # Explicit UTF-8 so the result does not depend on the platform's
            # default encoding.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(" ")
                f.write(content)
            print(f'已下載---->{chapter_name},{url}')
            url = urljoin(url, next_links[0]) if next_links else None
if __name__ == '__main__':
    # Ask for a title and run the interactive search; get_search_html()
    # itself continues into the download of the selected work. The former
    # trailing get_works_html() call passed no URL and raised TypeError.
    name = input("請輸入小說名稱:")
    quanshu = QuanShu(name)
    quanshu.get_search_html()