Python crawler: search for a novel and download it


# coding: utf-8
import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


class downloader():

    def __init__(self):
        self.urls = []  # chapter links
        self.name = []  # chapter names
        self.url = 'https://so.biqusoso.com/s.php?ie=utf-8&siteid=biqugex.com&q='

    def Get_url(self):
        """Ask for the novel name, search for it, and return the link of the first result."""
        # configure Chrome to run in headless (no-window) mode
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(self.url)
        c = input('Please enter the full name of the novel: ')
        # fill in the search box and click the search button
        browser.find_element(By.XPATH, '//*[@id="wrapper"]/div[1]/div[2]/form/input[3]').send_keys(c)
        browser.find_element(By.XPATH, '//*[@id="wrapper"]/div[1]/div[2]/form/input[4]').click()
        new_url = browser.current_url
        # quit() closes the window and shuts down the chromedriver process
        browser.quit()
        print('Browser closed')
        # print(new_url)
        response = requests.get(new_url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')
        # each result title sits in a <span class="s2">; take the first <a> link
        spans = soup.find_all('span', class_='s2')
        new_name = BeautifulSoup(str(spans), 'lxml').find('a')
        # print(new_name.string)  # title of the first result
        self.href = new_name.attrs['href']
        print(self.href)
        return self.href

    def Response(self):
        """Fetch the book page and collect the chapter names and links."""
        response = requests.get(self.href)
        response.encoding = 'gbk'  # the site is GBK-encoded; set it explicitly to avoid mojibake
        self.soup = BeautifulSoup(response.text, 'lxml')  # parse the book page
        div = self.soup.find_all('div', class_='listmain')  # the chapter list lives in class="listmain"
        soup1 = BeautifulSoup(str(div), 'lxml')  # re-parse the stringified result so we can search inside it
        h = soup1.find_all('a')  # every <a> under the chapter list
        for i in h:
            self.name.append(i.string)  # the tag text is the chapter name
            self.urls.append('https://www.biqugex.com%s' % i.get('href'))  # the href is a relative chapter link

    def file(self):
        """Read the novel title, create a folder with the same name, and save each chapter."""
        div1 = self.soup.select('body > div.book > div.info > h2')
        a = BeautifulSoup(str(div1), 'lxml')
        b = a.find('h2').string  # novel title
        c = 'C:\\Users\\Administrator\\Desktop\\%s' % b
        if not os.path.exists(c):
            os.mkdir(c)

        # walk through the chapter links and fetch each chapter body
        i = 0
        while i < len(self.urls):
            response1 = requests.get(url=self.urls[i])
            response1.encoding = 'gbk'
            soup2 = BeautifulSoup(response1.text, 'lxml')
            content = soup2.find('div', id='content')  # the chapter text is in <div id="content">
            # build the file name from the chapter name
            filename = c + '/' + self.name[i] + '.txt'
            print(filename)
            # write the chapter text to its own file
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(content.text)
            i += 1

    def Main(self):
        """Run the whole flow; if the search fails, print a message instead of crashing."""
        try:
            self.Get_url()
        except Exception:
            print('Novel not found')
        else:
            self.Response()
            self.file()


if __name__ == '__main__':
    # url = input('Please enter the URL: ')
    # url = 'https://www.biqugex.com/book_104027/'
    a = downloader()
    a.Main()
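
As a side note, the search URL used above already carries the keyword in its q query parameter, so the headless-Chrome step could in principle be replaced by a plain GET request. The sketch below is only an assumption-based illustration: the helper name search_first_result is not part of the original script, and whether the endpoint accepts the keyword this way is not verified here.

# A minimal sketch, assuming the search endpoint accepts the keyword directly
# in the "q" parameter (the URL in the script suggests this, but it is not verified).
import requests
from bs4 import BeautifulSoup

def search_first_result(keyword):
    # hypothetical helper, not part of the original script
    params = {'ie': 'utf-8', 'siteid': 'biqugex.com', 'q': keyword}
    r = requests.get('https://so.biqusoso.com/s.php', params=params)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    span = soup.find('span', class_='s2')        # first result row
    link = span.find('a') if span else None      # title link inside it
    return link.get('href') if link else None

If the endpoint behaves as assumed, the returned href could be assigned to self.href and fed straight into Response(), with no browser involved.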

 

